/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "impl_fp32_fp32.hpp"

namespace depthwise
{

// Make the names declared in neon_convolution_kernels usable unqualified
// inside this namespace.
using namespace neon_convolution_kernels;
// Shorthand for the DepthwiseConvolution instantiation whose execute_tile
// specialisations are defined in this file.
// NOTE(review): the integer arguments presumably encode a 3x3 output tile, a
// 3x3 kernel and a 1x1 stride, followed by the element types — confirm
// against the DepthwiseConvolution declaration.
using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;

#ifdef __aarch64__
/**
 * Compute one 3x3 output tile of an fp32 depthwise convolution on AArch64,
 * with no activation applied to the results.
 *
 * The body is a single hand-scheduled NEON assembly block. For every channel
 * it reads a 5x5 window of input values (five rows reached through
 * input_row_stride, five columns reached through input_col_stride),
 * accumulates them against nine weights on top of a bias value and stores
 * nine results (three rows by three columns) to the output.
 *
 * Channels are handled four at a time with 128-bit loads/stores
 * (n_channels / 4 iterations), followed by n_channels % 4 single-channel
 * iterations that use 32-bit loads/stores.
 *
 * Parameter blob layout, as consumed by the loads below: for each group the
 * bias comes first, followed by the nine weights in row-major kernel order.
 * A four-channel group therefore occupies 160 bytes (10 x 16) and each tail
 * channel occupies 40 bytes (10 x 4).
 *
 * @param n_channels        Number of channels to process.
 *                          NOTE(review): assumed non-negative; the value is
 *                          shifted logically inside the assembly.
 * @param weight_bias_ptr   Packed bias + weights in the layout described above.
 * @param input             First element of the 5x5 input window; channels are
 *                          contiguous (the pointer advances by one float per
 *                          channel).
 * @param input_row_stride  Distance between input rows, in float elements
 *                          (scaled to bytes before entering the assembly).
 * @param input_col_stride  Distance between input columns, in float elements.
 * @param output            First element of the 3x3 output tile; channels are
 *                          contiguous.
 * @param output_row_stride Distance between output rows, in float elements.
 * @param output_col_stride Distance between output columns, in float elements.
 */
template <>
template <>
void Conv::execute_tile<ActivationFunction::None>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *input,
  const unsigned int input_row_stride,
  const unsigned int input_col_stride,
  float *output,
  const unsigned int output_row_stride,
  const unsigned int output_col_stride
)
{
  __asm __volatile(
    // Prologue: derive row pointers and byte offsets.
    //   x20..x23 : input rows 1..4            x24, x25 : output rows 1, 2
    //   x13, x15, x17 : 2x, 3x, 4x input column stride
    //   x14, x16, x9  : the same offsets + 64 bytes, used only for prefetching
    //   x26 : 2x output column stride
    "add x20, %[inptr0], %[input_row_stride]\n"
    "add x13, %[input_col_stride1], %[input_col_stride1]\n"
    "add x24, %[outptr0], %[output_row_stride]\n"
    "add x21, x20, %[input_row_stride]\n"
    "add x14, x13, #64\n"
    "add x15, x13, %[input_col_stride1]\n"
    "add x22, x21, %[input_row_stride]\n"
    "add x16, x15, #64\n"
    "add x17, x15, %[input_col_stride1]\n"
    "add x23, x22, %[input_row_stride]\n"
    "add x9, x17, #64\n"
    // x19 is the prefetch offset for input column 1 (1x column stride + 64),
    // matching x14/x16/x9 for columns 2..4. It must be initialised here:
    // every "prfm ..., x19" below reads it, and nothing else writes it.
    "add x19, %[input_col_stride1], #64\n"
    "add x25, x24, %[output_row_stride]\n"
    "add x26, %[output_col_stride1], %[output_col_stride1]\n"
    // x27 = channels left for the scalar tail, x28 = number of 4-channel groups.
    "and x27, %[n_channels], #3\n"
    "lsr x28, %[n_channels], #2\n"
    "cbz x28, 4f\n"
    // 1: load bias (v25, copied into the nine accumulators) and the nine
    //    weights for the first 4-channel group, and start the first inputs.
    "1:\n"
    "ldr q25, [%[wbptr]]\n"
    "subs x28, x28, #1\n"
    "mov v17.16b, v25.16b\n"
    "ldr q16, [%[wbptr], #16]\n"
    "mov v13.16b, v25.16b\n"
    "ldr q7, [%[wbptr], #32]\n"
    "mov v15.16b, v25.16b\n"
    "ldr q6, [%[wbptr], #48]\n"
    "mov v10.16b, v25.16b\n"
    "ldr q5, [%[wbptr], #64]\n"
    "mov v12.16b, v25.16b\n"
    "ldr q4, [%[wbptr], #80]\n"
    "mov v14.16b, v25.16b\n"
    "ldr q3, [%[wbptr], #96]\n"
    "mov v9.16b, v25.16b\n"
    "ldr q2, [%[wbptr], #112]\n"
    "mov v11.16b, v25.16b\n"
    "ldr q1, [%[wbptr], #128]\n"
    "mov v8.16b, v25.16b\n"
    "ldr q0, [%[wbptr], #144]\n"
    "ldr q26, [%[inptr0]]\n"
    "ldr q28, [x20]\n"
    "fmla v17.4s, v26.4s, v16.4s\n"
    "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v13.4s, v28.4s, v16.4s\n"
    "ldr q27, [x21]\n"
    "fmla v15.4s, v29.4s, v16.4s\n"
    "ldr q21, [x20, %[input_col_stride1]]\n"
    "fmla v17.4s, v28.4s, v5.4s\n"
    "ldr q20, [%[inptr0], x13]\n"
    "ldr q23, [x22]\n"
    "ldr q19, [x21, %[input_col_stride1]]\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x20, #64]\n"
    "fmla v17.4s, v29.4s, v7.4s\n"
    "prfm pldl1keep, [%[inptr0], x19]\n"
    "prfm pldl1keep, [x21, #64]\n"
    "prfm pldl1keep, [x20, x19]\n"
    "prfm pldl1keep, [%[inptr0], x14]\n"
    "prfm pldl1keep, [x22, #64]\n"
    "prfm pldl1keep, [x21, x19]\n"
    // Flags still come from the "subs x28" above: only one group -> go to 3.
    "beq 3f\n"
    // 2: steady-state vector loop. Finishes the current group while loading
    //    the bias, weights and first inputs of the next one.
    "2:\n"
    "fmla v17.4s, v27.4s, v2.4s\n"
    "ldr q30, [x20, x13]\n"
    "fmla v13.4s, v27.4s, v5.4s\n"
    "ldr q29, [%[inptr0], x15]\n"
    "fmla v10.4s, v27.4s, v16.4s\n"
    "ldr q28, [x23]\n"
    "fmla v17.4s, v21.4s, v4.4s\n"
    "ldr q24, [x22, %[input_col_stride1]]\n"
    "fmla v13.4s, v21.4s, v7.4s\n"
    "ldr q18, [x21, x13]\n"
    "fmla v15.4s, v21.4s, v5.4s\n"
    "prfm pldl1keep, [x20, x14]\n"
    "fmla v12.4s, v21.4s, v16.4s\n"
    "ldr q22, [x20, x15]\n"
    "fmla v17.4s, v20.4s, v6.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v15.4s, v20.4s, v7.4s\n"
    "prfm pldl1keep, [x23, #64]\n"
    "fmla v14.4s, v20.4s, v16.4s\n"
    "ldr q25, [%[inptr0], x17]\n"
    "fmla v13.4s, v23.4s, v2.4s\n"
    "prfm pldl1keep, [x22, x19]\n"
    "fmla v10.4s, v23.4s, v5.4s\n"
    "ldr q26, [x23, %[input_col_stride1]]\n"
    "fmla v17.4s, v19.4s, v1.4s\n"
    "prfm pldl1keep, [x21, x14]\n"
    "fmla v13.4s, v19.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x16]\n"
    "fmla v15.4s, v19.4s, v2.4s\n"
    "prfm pldl1keep, [%[inptr0], x9]\n"
    "fmla v10.4s, v19.4s, v7.4s\n"
    "prfm pldl1keep, [x23, x19]\n"
    "fmla v12.4s, v19.4s, v5.4s\n"
    "prfm pldl1keep, [x22, x14]\n"
    "fmla v9.4s, v19.4s, v16.4s\n"
    "ldr q27, [x22, x13]\n"
    "fmla v17.4s, v30.4s, v3.4s\n"
    "prfm pldl1keep, [x21, x16]\n"
    "fmla v13.4s, v30.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x9]\n"
    "fmla v15.4s, v30.4s, v4.4s\n"
    "prfm pldl1keep, [x23, x14]\n"
    "fmla v12.4s, v30.4s, v7.4s\n"
    "prfm pldl1keep, [x22, x16]\n"
    "fmla v14.4s, v30.4s, v5.4s\n"
    "prfm pldl1keep, [x21, x9]\n"
    "fmla v11.4s, v30.4s, v16.4s\n"
    "ldr q21, [x21, x15]\n"
    "fmla v15.4s, v29.4s, v6.4s\n"
    "prfm pldl1keep, [x23, x16]\n"
    "fmla v14.4s, v29.4s, v7.4s\n"
    "ldr q20, [x20, x17]\n"
    "fmla v10.4s, v28.4s, v2.4s\n"
    "ldr q19, [x23, x13]\n"
    "fmla v13.4s, v24.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x9]\n"
    "fmla v12.4s, v24.4s, v2.4s\n"
    "prfm pldl1keep, [x23, x9]\n"
    "fmla v10.4s, v24.4s, v4.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v9.4s, v24.4s, v5.4s\n"
    "ldr q23, [x22, x15]\n"
    "fmla v17.4s, v18.4s, v0.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v13.4s, v18.4s, v3.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v15.4s, v18.4s, v1.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "str q17, [%[outptr0]]\n"
    "fmla v10.4s, v18.4s, v6.4s\n"
    "fmla v12.4s, v18.4s, v4.4s\n"
    "ldr q17, [x21, x17]\n"
    "fmla v14.4s, v18.4s, v2.4s\n"
    "prfm pldl1keep, [%[inptr0], x19]\n"
    "fmla v9.4s, v18.4s, v7.4s\n"
    "prfm pldl1keep, [%[inptr0], x14]\n"
    "fmla v11.4s, v18.4s, v5.4s\n"
    "add x20, x20, #16\n"
    "fmla v8.4s, v18.4s, v16.4s\n"
    "ldr q24, [x23, x15]\n"
    "fmla v15.4s, v22.4s, v3.4s\n"
    "ldr q18, [x22, x17]\n"
    "fmla v12.4s, v22.4s, v6.4s\n"
    "prfm pldl1keep, [x20, #64]\n"
    "fmla v14.4s, v22.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x19]\n"
    "fmla v11.4s, v22.4s, v7.4s\n"
    "ldr q22, [x23, x17]\n"
    "fmla v10.4s, v26.4s, v1.4s\n"
    "add x21, x21, #16\n"
    "fmla v14.4s, v25.4s, v6.4s\n"
    "ldr q25, [%[wbptr]]\n"
    "fmla v9.4s, v26.4s, v2.4s\n"
    "ldr q16, [%[wbptr], #16]\n"
    "fmla v13.4s, v27.4s, v0.4s\n"
    "prfm pldl1keep, [x21, #64]\n"
    "fmla v10.4s, v27.4s, v3.4s\n"
    "prfm pldl1keep, [x21, x19]\n"
    "fmla v12.4s, v27.4s, v1.4s\n"
    "add x22, x22, #16\n"
    "str q13, [x24]\n"
    "fmla v9.4s, v27.4s, v4.4s\n"
    "fmla v11.4s, v27.4s, v2.4s\n"
    "ldr q26, [%[inptr0]]\n"
    "fmla v8.4s, v27.4s, v5.4s\n"
    "ldr q28, [x20]\n"
    "fmla v15.4s, v21.4s, v0.4s\n"
    "ldr q29, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v12.4s, v21.4s, v3.4s\n"
    "prfm pldl1keep, [x22, #64]\n"
    "fmla v14.4s, v21.4s, v1.4s\n"
    "add x23, x23, #16\n"
    "str q15, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v9.4s, v21.4s, v6.4s\n"
    "fmla v11.4s, v21.4s, v4.4s\n"
    "ldr q5, [%[wbptr], #64]\n"
    "fmla v8.4s, v21.4s, v7.4s\n"
    "ldr q27, [x21]\n"
    "fmla v14.4s, v20.4s, v3.4s\n"
    "ldr q21, [x20, %[input_col_stride1]]\n"
    "fmla v11.4s, v20.4s, v6.4s\n"
    "ldr q20, [%[inptr0], x13]\n"
    "fmla v10.4s, v19.4s, v0.4s\n"
    "subs x28, x28, #1\n"
    "fmla v9.4s, v19.4s, v1.4s\n"
    "fmla v8.4s, v19.4s, v2.4s\n"
    "fmla v12.4s, v23.4s, v0.4s\n"
    "ldr q7, [%[wbptr], #32]\n"
    "str q10, [x25]\n"
    "fmla v11.4s, v23.4s, v1.4s\n"
    "fmla v9.4s, v23.4s, v3.4s\n"
    "ldr q2, [%[wbptr], #112]\n"
    "str q12, [x24, %[output_col_stride1]]\n"
    "fmla v8.4s, v23.4s, v4.4s\n"
    "fmla v14.4s, v17.4s, v0.4s\n"
    "ldr q23, [x22]\n"
    "fmla v11.4s, v17.4s, v3.4s\n"
    "ldr q19, [x21, %[input_col_stride1]]\n"
    "fmla v8.4s, v17.4s, v6.4s\n"
    "ldr q4, [%[wbptr], #80]\n"
    "str q14, [%[outptr0], x26]\n"
    "fmla v9.4s, v24.4s, v0.4s\n"
    "fmla v11.4s, v18.4s, v0.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmla v8.4s, v24.4s, v1.4s\n"
    "ldr q6, [%[wbptr], #48]\n"
    "str q9, [x25, %[output_col_stride1]]\n"
    "mov v17.16b, v25.16b\n"
    "str q11, [x24, x26]\n"
    "mov v13.16b, v25.16b\n"
    "fmla v8.4s, v18.4s, v3.4s\n"
    "ldr q1, [%[wbptr], #128]\n"
    "mov v15.16b, v25.16b\n"
    "add x24, x24, #16\n"
    "mov v10.16b, v25.16b\n"
    "mov v12.16b, v25.16b\n"
    "fmla v8.4s, v22.4s, v0.4s\n"
    "ldr q3, [%[wbptr], #96]\n"
    "mov v14.16b, v25.16b\n"
    "mov v9.16b, v25.16b\n"
    "mov v11.16b, v25.16b\n"
    "fmla v17.4s, v26.4s, v16.4s\n"
    "str q8, [x25, x26]\n"
    "fmla v13.4s, v28.4s, v16.4s\n"
    "mov v8.16b, v25.16b\n"
    "ldr q0, [%[wbptr], #144]\n"
    "fmla v17.4s, v28.4s, v5.4s\n"
    "fmla v15.4s, v29.4s, v16.4s\n"
    "add x25, x25, #16\n"
    "fmla v17.4s, v29.4s, v7.4s\n"
    "bne 2b\n"
    // 3: final 4-channel group; same arithmetic as 2 but nothing is preloaded
    //    for a following group.
    "3:\n"
    "fmla v17.4s, v27.4s, v2.4s\n"
    "ldr q30, [x20, x13]\n"
    "fmla v13.4s, v27.4s, v5.4s\n"
    "ldr q29, [%[inptr0], x15]\n"
    "fmla v10.4s, v27.4s, v16.4s\n"
    "ldr q28, [x23]\n"
    "fmla v17.4s, v21.4s, v4.4s\n"
    "ldr q24, [x22, %[input_col_stride1]]\n"
    "fmla v13.4s, v21.4s, v7.4s\n"
    "ldr q18, [x21, x13]\n"
    "fmla v15.4s, v21.4s, v5.4s\n"
    "prfm pldl1keep, [x20, x14]\n"
    "fmla v12.4s, v21.4s, v16.4s\n"
    "ldr q22, [x20, x15]\n"
    "fmla v17.4s, v20.4s, v6.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v15.4s, v20.4s, v7.4s\n"
    "prfm pldl1keep, [x23, #64]\n"
    "fmla v14.4s, v20.4s, v16.4s\n"
    "ldr q25, [%[inptr0], x17]\n"
    "fmla v13.4s, v23.4s, v2.4s\n"
    "prfm pldl1keep, [x22, x19]\n"
    "fmla v10.4s, v23.4s, v5.4s\n"
    "ldr q26, [x23, %[input_col_stride1]]\n"
    "fmla v17.4s, v19.4s, v1.4s\n"
    "prfm pldl1keep, [x21, x14]\n"
    "fmla v13.4s, v19.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x16]\n"
    "fmla v15.4s, v19.4s, v2.4s\n"
    "prfm pldl1keep, [%[inptr0], x9]\n"
    "fmla v10.4s, v19.4s, v7.4s\n"
    "prfm pldl1keep, [x23, x19]\n"
    "fmla v12.4s, v19.4s, v5.4s\n"
    "prfm pldl1keep, [x22, x14]\n"
    "fmla v9.4s, v19.4s, v16.4s\n"
    "ldr q27, [x22, x13]\n"
    "fmla v17.4s, v30.4s, v3.4s\n"
    "prfm pldl1keep, [x21, x16]\n"
    "fmla v13.4s, v30.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x9]\n"
    "fmla v15.4s, v30.4s, v4.4s\n"
    "prfm pldl1keep, [x23, x14]\n"
    "fmla v12.4s, v30.4s, v7.4s\n"
    "prfm pldl1keep, [x22, x16]\n"
    "fmla v14.4s, v30.4s, v5.4s\n"
    "prfm pldl1keep, [x21, x9]\n"
    "fmla v11.4s, v30.4s, v16.4s\n"
    "ldr q21, [x21, x15]\n"
    "fmla v15.4s, v29.4s, v6.4s\n"
    "prfm pldl1keep, [x23, x16]\n"
    "fmla v14.4s, v29.4s, v7.4s\n"
    "ldr q20, [x20, x17]\n"
    "fmla v10.4s, v28.4s, v2.4s\n"
    "ldr q19, [x23, x13]\n"
    "fmla v13.4s, v24.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x9]\n"
    "fmla v12.4s, v24.4s, v2.4s\n"
    "prfm pldl1keep, [x23, x9]\n"
    "fmla v10.4s, v24.4s, v4.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v9.4s, v24.4s, v5.4s\n"
    "ldr q23, [x22, x15]\n"
    "fmla v17.4s, v18.4s, v0.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v13.4s, v18.4s, v3.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v15.4s, v18.4s, v1.4s\n"
    "add x20, x20, #16\n"
    "str q17, [%[outptr0]]\n"
    "fmla v10.4s, v18.4s, v6.4s\n"
    "fmla v12.4s, v18.4s, v4.4s\n"
    "ldr q17, [x21, x17]\n"
    "fmla v14.4s, v18.4s, v2.4s\n"
    "add x21, x21, #16\n"
    "fmla v9.4s, v18.4s, v7.4s\n"
    "fmla v11.4s, v18.4s, v5.4s\n"
    "fmla v8.4s, v18.4s, v16.4s\n"
    "ldr q24, [x23, x15]\n"
    "fmla v15.4s, v22.4s, v3.4s\n"
    "ldr q18, [x22, x17]\n"
    "fmla v12.4s, v22.4s, v6.4s\n"
    "add x22, x22, #16\n"
    "fmla v14.4s, v22.4s, v4.4s\n"
    "fmla v11.4s, v22.4s, v7.4s\n"
    "fmla v10.4s, v26.4s, v1.4s\n"
    "ldr q22, [x23, x17]\n"
    "fmla v9.4s, v26.4s, v2.4s\n"
    "add x23, x23, #16\n"
    "fmla v14.4s, v25.4s, v6.4s\n"
    "fmla v13.4s, v27.4s, v0.4s\n"
    "fmla v10.4s, v27.4s, v3.4s\n"
    "fmla v12.4s, v27.4s, v1.4s\n"
    "fmla v9.4s, v27.4s, v4.4s\n"
    "fmla v11.4s, v27.4s, v2.4s\n"
    "str q13, [x24]\n"
    "fmla v8.4s, v27.4s, v5.4s\n"
    "fmla v15.4s, v21.4s, v0.4s\n"
    "fmla v12.4s, v21.4s, v3.4s\n"
    "fmla v14.4s, v21.4s, v1.4s\n"
    "fmla v9.4s, v21.4s, v6.4s\n"
    "fmla v11.4s, v21.4s, v4.4s\n"
    "fmla v8.4s, v21.4s, v7.4s\n"
    "str q15, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v10.4s, v19.4s, v0.4s\n"
    "fmla v14.4s, v20.4s, v3.4s\n"
    "fmla v9.4s, v19.4s, v1.4s\n"
    "fmla v11.4s, v20.4s, v6.4s\n"
    "fmla v8.4s, v19.4s, v2.4s\n"
    "str q10, [x25]\n"
    "fmla v12.4s, v23.4s, v0.4s\n"
    "fmla v9.4s, v23.4s, v3.4s\n"
    "fmla v14.4s, v17.4s, v0.4s\n"
    "fmla v11.4s, v23.4s, v1.4s\n"
    "fmla v8.4s, v23.4s, v4.4s\n"
    "str q12, [x24, %[output_col_stride1]]\n"
    "fmla v9.4s, v24.4s, v0.4s\n"
    "str q14, [%[outptr0], x26]\n"
    "fmla v11.4s, v17.4s, v3.4s\n"
    "fmla v8.4s, v17.4s, v6.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "str q9, [x25, %[output_col_stride1]]\n"
    "fmla v11.4s, v18.4s, v0.4s\n"
    "fmla v8.4s, v24.4s, v1.4s\n"
    "str q11, [x24, x26]\n"
    "fmla v8.4s, v18.4s, v3.4s\n"
    "add x24, x24, #16\n"
    "fmla v8.4s, v22.4s, v0.4s\n"
    "str q8, [x25, x26]\n"
    "add x25, x25, #16\n"
    // 4: scalar tail for the remaining n_channels % 4 channels. Same schedule
    //    as labels 1-3, but with 32-bit loads/stores and 4-byte pointer steps.
    "4:\n"
    "cbz x27, 7f\n"
    "ldr s25, [%[wbptr]]\n"
    "mov v17.16b, v25.16b\n"
    "ldr s16, [%[wbptr], #4]\n"
    "mov v13.16b, v25.16b\n"
    "ldr s7, [%[wbptr], #8]\n"
    "mov v15.16b, v25.16b\n"
    "ldr s6, [%[wbptr], #12]\n"
    "mov v10.16b, v25.16b\n"
    "ldr s5, [%[wbptr], #16]\n"
    "mov v12.16b, v25.16b\n"
    "ldr s4, [%[wbptr], #20]\n"
    "mov v14.16b, v25.16b\n"
    "ldr s3, [%[wbptr], #24]\n"
    "mov v9.16b, v25.16b\n"
    "ldr s2, [%[wbptr], #28]\n"
    "mov v11.16b, v25.16b\n"
    "ldr s1, [%[wbptr], #32]\n"
    "mov v8.16b, v25.16b\n"
    "ldr s0, [%[wbptr], #36]\n"
    "ldr s26, [%[inptr0]]\n"
    "subs x27, x27, #1\n"
    "fmla v17.4s, v26.4s, v16.4s\n"
    "ldr s28, [x20]\n"
    "fmla v13.4s, v28.4s, v16.4s\n"
    "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v15.4s, v29.4s, v16.4s\n"
    "ldr s27, [x21]\n"
    "fmla v17.4s, v28.4s, v5.4s\n"
    "ldr s21, [x20, %[input_col_stride1]]\n"
    "ldr s20, [%[inptr0], x13]\n"
    "ldr s23, [x22]\n"
    "ldr s19, [x21, %[input_col_stride1]]\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v17.4s, v29.4s, v7.4s\n"
    "prfm pldl1keep, [x20, #64]\n"
    "prfm pldl1keep, [%[inptr0], x19]\n"
    "prfm pldl1keep, [x21, #64]\n"
    "prfm pldl1keep, [x20, x19]\n"
    "prfm pldl1keep, [%[inptr0], x14]\n"
    "prfm pldl1keep, [x22, #64]\n"
    "prfm pldl1keep, [x21, x19]\n"
    "beq 6f\n"
    // 5: steady-state scalar loop.
    "5:\n"
    "fmla v17.4s, v27.4s, v2.4s\n"
    "ldr s30, [x20, x13]\n"
    "fmla v13.4s, v27.4s, v5.4s\n"
    "ldr s29, [%[inptr0], x15]\n"
    "fmla v10.4s, v27.4s, v16.4s\n"
    "ldr s28, [x23]\n"
    "fmla v17.4s, v21.4s, v4.4s\n"
    "ldr s24, [x22, %[input_col_stride1]]\n"
    "fmla v13.4s, v21.4s, v7.4s\n"
    "ldr s18, [x21, x13]\n"
    "fmla v15.4s, v21.4s, v5.4s\n"
    "prfm pldl1keep, [x20, x14]\n"
    "fmla v12.4s, v21.4s, v16.4s\n"
    "ldr s22, [x20, x15]\n"
    "fmla v17.4s, v20.4s, v6.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v15.4s, v20.4s, v7.4s\n"
    "prfm pldl1keep, [x23, #64]\n"
    "fmla v14.4s, v20.4s, v16.4s\n"
    "ldr s25, [%[inptr0], x17]\n"
    "fmla v13.4s, v23.4s, v2.4s\n"
    "prfm pldl1keep, [x22, x19]\n"
    "fmla v10.4s, v23.4s, v5.4s\n"
    "ldr s26, [x23, %[input_col_stride1]]\n"
    "fmla v17.4s, v19.4s, v1.4s\n"
    "prfm pldl1keep, [x21, x14]\n"
    "fmla v13.4s, v19.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x16]\n"
    "fmla v15.4s, v19.4s, v2.4s\n"
    "prfm pldl1keep, [%[inptr0], x9]\n"
    "fmla v10.4s, v19.4s, v7.4s\n"
    "prfm pldl1keep, [x23, x19]\n"
    "fmla v12.4s, v19.4s, v5.4s\n"
    "prfm pldl1keep, [x22, x14]\n"
    "fmla v9.4s, v19.4s, v16.4s\n"
    "ldr s27, [x22, x13]\n"
    "fmla v17.4s, v30.4s, v3.4s\n"
    "prfm pldl1keep, [x21, x16]\n"
    "fmla v13.4s, v30.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x9]\n"
    "fmla v15.4s, v30.4s, v4.4s\n"
    "prfm pldl1keep, [x23, x14]\n"
    "fmla v12.4s, v30.4s, v7.4s\n"
    "prfm pldl1keep, [x22, x16]\n"
    "fmla v14.4s, v30.4s, v5.4s\n"
    "prfm pldl1keep, [x21, x9]\n"
    "fmla v11.4s, v30.4s, v16.4s\n"
    "ldr s21, [x21, x15]\n"
    "fmla v15.4s, v29.4s, v6.4s\n"
    "prfm pldl1keep, [x23, x16]\n"
    "fmla v14.4s, v29.4s, v7.4s\n"
    "ldr s20, [x20, x17]\n"
    "fmla v10.4s, v28.4s, v2.4s\n"
    "ldr s19, [x23, x13]\n"
    "fmla v13.4s, v24.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x9]\n"
    "fmla v12.4s, v24.4s, v2.4s\n"
    "prfm pldl1keep, [x23, x9]\n"
    "fmla v10.4s, v24.4s, v4.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v9.4s, v24.4s, v5.4s\n"
    "ldr s23, [x22, x15]\n"
    "fmla v17.4s, v18.4s, v0.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v13.4s, v18.4s, v3.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v15.4s, v18.4s, v1.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "str s17, [%[outptr0]]\n"
    "fmla v10.4s, v18.4s, v6.4s\n"
    "fmla v12.4s, v18.4s, v4.4s\n"
    "ldr s17, [x21, x17]\n"
    "fmla v14.4s, v18.4s, v2.4s\n"
    "prfm pldl1keep, [%[inptr0], x19]\n"
    "fmla v9.4s, v18.4s, v7.4s\n"
    "prfm pldl1keep, [%[inptr0], x14]\n"
    "fmla v11.4s, v18.4s, v5.4s\n"
    "add x20, x20, #4\n"
    "fmla v8.4s, v18.4s, v16.4s\n"
    "ldr s24, [x23, x15]\n"
    "fmla v15.4s, v22.4s, v3.4s\n"
    "ldr s18, [x22, x17]\n"
    "fmla v12.4s, v22.4s, v6.4s\n"
    "prfm pldl1keep, [x20, #64]\n"
    "fmla v14.4s, v22.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x19]\n"
    "fmla v11.4s, v22.4s, v7.4s\n"
    "ldr s22, [x23, x17]\n"
    "fmla v10.4s, v26.4s, v1.4s\n"
    "add x21, x21, #4\n"
    "fmla v14.4s, v25.4s, v6.4s\n"
    "ldr s25, [%[wbptr]]\n"
    "fmla v9.4s, v26.4s, v2.4s\n"
    "ldr s16, [%[wbptr], #4]\n"
    "fmla v13.4s, v27.4s, v0.4s\n"
    "prfm pldl1keep, [x21, #64]\n"
    "fmla v10.4s, v27.4s, v3.4s\n"
    "prfm pldl1keep, [x21, x19]\n"
    "fmla v12.4s, v27.4s, v1.4s\n"
    "add x22, x22, #4\n"
    "str s13, [x24]\n"
    "fmla v9.4s, v27.4s, v4.4s\n"
    "fmla v11.4s, v27.4s, v2.4s\n"
    "ldr s26, [%[inptr0]]\n"
    "fmla v8.4s, v27.4s, v5.4s\n"
    "ldr s28, [x20]\n"
    "fmla v15.4s, v21.4s, v0.4s\n"
    "ldr s29, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v12.4s, v21.4s, v3.4s\n"
    "prfm pldl1keep, [x22, #64]\n"
    "fmla v14.4s, v21.4s, v1.4s\n"
    "add x23, x23, #4\n"
    "str s15, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v9.4s, v21.4s, v6.4s\n"
    "fmla v11.4s, v21.4s, v4.4s\n"
    "ldr s5, [%[wbptr], #16]\n"
    "fmla v8.4s, v21.4s, v7.4s\n"
    "ldr s27, [x21]\n"
    "fmla v14.4s, v20.4s, v3.4s\n"
    "ldr s21, [x20, %[input_col_stride1]]\n"
    "fmla v11.4s, v20.4s, v6.4s\n"
    "ldr s20, [%[inptr0], x13]\n"
    "fmla v10.4s, v19.4s, v0.4s\n"
    "subs x27, x27, #1\n"
    "fmla v9.4s, v19.4s, v1.4s\n"
    "fmla v8.4s, v19.4s, v2.4s\n"
    "fmla v12.4s, v23.4s, v0.4s\n"
    "ldr s7, [%[wbptr], #8]\n"
    "str s10, [x25]\n"
    "fmla v11.4s, v23.4s, v1.4s\n"
    "fmla v9.4s, v23.4s, v3.4s\n"
    "ldr s2, [%[wbptr], #28]\n"
    "str s12, [x24, %[output_col_stride1]]\n"
    "fmla v8.4s, v23.4s, v4.4s\n"
    "fmla v14.4s, v17.4s, v0.4s\n"
    "ldr s23, [x22]\n"
    "fmla v11.4s, v17.4s, v3.4s\n"
    "ldr s19, [x21, %[input_col_stride1]]\n"
    "fmla v8.4s, v17.4s, v6.4s\n"
    "ldr s4, [%[wbptr], #20]\n"
    "str s14, [%[outptr0], x26]\n"
    "fmla v9.4s, v24.4s, v0.4s\n"
    "fmla v11.4s, v18.4s, v0.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmla v8.4s, v24.4s, v1.4s\n"
    "ldr s6, [%[wbptr], #12]\n"
    "str s9, [x25, %[output_col_stride1]]\n"
    "mov v17.16b, v25.16b\n"
    "str s11, [x24, x26]\n"
    "mov v13.16b, v25.16b\n"
    "fmla v8.4s, v18.4s, v3.4s\n"
    "ldr s1, [%[wbptr], #32]\n"
    "mov v15.16b, v25.16b\n"
    "add x24, x24, #4\n"
    "mov v10.16b, v25.16b\n"
    "mov v12.16b, v25.16b\n"
    "fmla v8.4s, v22.4s, v0.4s\n"
    "ldr s3, [%[wbptr], #24]\n"
    "mov v14.16b, v25.16b\n"
    "mov v9.16b, v25.16b\n"
    "mov v11.16b, v25.16b\n"
    "fmla v17.4s, v26.4s, v16.4s\n"
    "str s8, [x25, x26]\n"
    "fmla v13.4s, v28.4s, v16.4s\n"
    "mov v8.16b, v25.16b\n"
    "ldr s0, [%[wbptr], #36]\n"
    "fmla v17.4s, v28.4s, v5.4s\n"
    "fmla v15.4s, v29.4s, v16.4s\n"
    "add x25, x25, #4\n"
    "fmla v17.4s, v29.4s, v7.4s\n"
    "bne 5b\n"
    // 6: final scalar channel.
    "6:\n"
    "fmla v17.4s, v27.4s, v2.4s\n"
    "ldr s30, [x20, x13]\n"
    "fmla v13.4s, v27.4s, v5.4s\n"
    "ldr s29, [%[inptr0], x15]\n"
    "fmla v10.4s, v27.4s, v16.4s\n"
    "ldr s28, [x23]\n"
    "fmla v17.4s, v21.4s, v4.4s\n"
    "ldr s24, [x22, %[input_col_stride1]]\n"
    "fmla v13.4s, v21.4s, v7.4s\n"
    "ldr s18, [x21, x13]\n"
    "fmla v15.4s, v21.4s, v5.4s\n"
    "prfm pldl1keep, [x20, x14]\n"
    "fmla v12.4s, v21.4s, v16.4s\n"
    "ldr s22, [x20, x15]\n"
    "fmla v17.4s, v20.4s, v6.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v15.4s, v20.4s, v7.4s\n"
    "prfm pldl1keep, [x23, #64]\n"
    "fmla v14.4s, v20.4s, v16.4s\n"
    "ldr s25, [%[inptr0], x17]\n"
    "fmla v13.4s, v23.4s, v2.4s\n"
    "prfm pldl1keep, [x22, x19]\n"
    "fmla v10.4s, v23.4s, v5.4s\n"
    "ldr s26, [x23, %[input_col_stride1]]\n"
    "fmla v17.4s, v19.4s, v1.4s\n"
    "prfm pldl1keep, [x21, x14]\n"
    "fmla v13.4s, v19.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x16]\n"
    "fmla v15.4s, v19.4s, v2.4s\n"
    "prfm pldl1keep, [%[inptr0], x9]\n"
    "fmla v10.4s, v19.4s, v7.4s\n"
    "prfm pldl1keep, [x23, x19]\n"
    "fmla v12.4s, v19.4s, v5.4s\n"
    "prfm pldl1keep, [x22, x14]\n"
    "fmla v9.4s, v19.4s, v16.4s\n"
    "ldr s27, [x22, x13]\n"
    "fmla v17.4s, v30.4s, v3.4s\n"
    "prfm pldl1keep, [x21, x16]\n"
    "fmla v13.4s, v30.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x9]\n"
    "fmla v15.4s, v30.4s, v4.4s\n"
    "prfm pldl1keep, [x23, x14]\n"
    "fmla v12.4s, v30.4s, v7.4s\n"
    "prfm pldl1keep, [x22, x16]\n"
    "fmla v14.4s, v30.4s, v5.4s\n"
    "prfm pldl1keep, [x21, x9]\n"
    "fmla v11.4s, v30.4s, v16.4s\n"
    "ldr s21, [x21, x15]\n"
    "fmla v15.4s, v29.4s, v6.4s\n"
    "prfm pldl1keep, [x23, x16]\n"
    "fmla v14.4s, v29.4s, v7.4s\n"
    "ldr s20, [x20, x17]\n"
    "fmla v10.4s, v28.4s, v2.4s\n"
    "ldr s19, [x23, x13]\n"
    "fmla v13.4s, v24.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x9]\n"
    "fmla v12.4s, v24.4s, v2.4s\n"
    "prfm pldl1keep, [x23, x9]\n"
    "fmla v10.4s, v24.4s, v4.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v9.4s, v24.4s, v5.4s\n"
    "ldr s23, [x22, x15]\n"
    "fmla v17.4s, v18.4s, v0.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v13.4s, v18.4s, v3.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v15.4s, v18.4s, v1.4s\n"
    "add x20, x20, #4\n"
    "str s17, [%[outptr0]]\n"
    "fmla v10.4s, v18.4s, v6.4s\n"
    "fmla v12.4s, v18.4s, v4.4s\n"
    "ldr s17, [x21, x17]\n"
    "fmla v14.4s, v18.4s, v2.4s\n"
    "add x21, x21, #4\n"
    "fmla v9.4s, v18.4s, v7.4s\n"
    "fmla v11.4s, v18.4s, v5.4s\n"
    "fmla v8.4s, v18.4s, v16.4s\n"
    "ldr s24, [x23, x15]\n"
    "fmla v15.4s, v22.4s, v3.4s\n"
    "ldr s18, [x22, x17]\n"
    "fmla v12.4s, v22.4s, v6.4s\n"
    "add x22, x22, #4\n"
    "fmla v14.4s, v22.4s, v4.4s\n"
    "fmla v11.4s, v22.4s, v7.4s\n"
    "fmla v10.4s, v26.4s, v1.4s\n"
    "ldr s22, [x23, x17]\n"
    "fmla v9.4s, v26.4s, v2.4s\n"
    "add x23, x23, #4\n"
    "fmla v14.4s, v25.4s, v6.4s\n"
    "fmla v13.4s, v27.4s, v0.4s\n"
    "fmla v10.4s, v27.4s, v3.4s\n"
    "fmla v12.4s, v27.4s, v1.4s\n"
    "fmla v9.4s, v27.4s, v4.4s\n"
    "fmla v11.4s, v27.4s, v2.4s\n"
    "str s13, [x24]\n"
    "fmla v8.4s, v27.4s, v5.4s\n"
    "fmla v15.4s, v21.4s, v0.4s\n"
    "fmla v12.4s, v21.4s, v3.4s\n"
    "fmla v14.4s, v21.4s, v1.4s\n"
    "fmla v9.4s, v21.4s, v6.4s\n"
    "fmla v11.4s, v21.4s, v4.4s\n"
    "fmla v8.4s, v21.4s, v7.4s\n"
    "str s15, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v10.4s, v19.4s, v0.4s\n"
    "fmla v14.4s, v20.4s, v3.4s\n"
    "fmla v9.4s, v19.4s, v1.4s\n"
    "fmla v11.4s, v20.4s, v6.4s\n"
    "fmla v8.4s, v19.4s, v2.4s\n"
    "str s10, [x25]\n"
    "fmla v12.4s, v23.4s, v0.4s\n"
    "fmla v9.4s, v23.4s, v3.4s\n"
    "fmla v14.4s, v17.4s, v0.4s\n"
    "fmla v11.4s, v23.4s, v1.4s\n"
    "fmla v8.4s, v23.4s, v4.4s\n"
    "str s12, [x24, %[output_col_stride1]]\n"
    "fmla v9.4s, v24.4s, v0.4s\n"
    "str s14, [%[outptr0], x26]\n"
    "fmla v11.4s, v17.4s, v3.4s\n"
    "fmla v8.4s, v17.4s, v6.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "str s9, [x25, %[output_col_stride1]]\n"
    "fmla v11.4s, v18.4s, v0.4s\n"
    "fmla v8.4s, v24.4s, v1.4s\n"
    "str s11, [x24, x26]\n"
    "fmla v8.4s, v18.4s, v3.4s\n"
    "add x24, x24, #4\n"
    "fmla v8.4s, v22.4s, v0.4s\n"
    "str s8, [x25, x26]\n"
    "add x25, x25, #4\n"
    // 7: done.
    "7:\n"
    // The three pointers are advanced in place by the assembly, hence "+r".
    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
    // Strides are converted from elements to bytes here so the assembly can
    // use them directly as register offsets.
    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" (static_cast<long>(n_channels)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
  );
}

/**
 * Compute one 3x3 output tile (3x3 kernel, stride 1) with a fused ReLU, for
 * every channel, using hand-scheduled AArch64 NEON assembly.
 *
 * For each channel the kernel reads a 5x5 patch of the input (five row
 * pointers, five column offsets) and writes a 3x3 patch of the output. The
 * result is clamped from below at zero (`fmax` against an all-zero vector)
 * before it is stored.
 *
 * Channels are handled in two phases:
 *   - groups of four channels (`n_channels >> 2`) using 128-bit `q` loads and
 *     stores (labels 1/2/3);
 *   - the remaining `n_channels & 3` channels one at a time using 32-bit `s`
 *     loads and stores (labels 4/5/6).
 *
 * @param n_channels         Number of channels to process.
 * @param weight_bias_ptr    Packed parameters. Per channel group the asm reads
 *                           ten consecutive vectors: the first seeds all nine
 *                           accumulators (presumably the bias, going by the
 *                           parameter name - confirm against the packing
 *                           code), the following nine are the multiplicands.
 * @param input              Pointer to the top-left element of the input patch.
 * @param input_row_stride   Input row stride in elements (scaled to bytes here).
 * @param input_col_stride   Input column stride in elements (scaled to bytes here).
 * @param output             Pointer to the top-left element of the output tile.
 * @param output_row_stride  Output row stride in elements (scaled to bytes here).
 * @param output_col_stride  Output column stride in elements (scaled to bytes here).
 */
template <>
template <>
void Conv::execute_tile<ActivationFunction::ReLU>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *input,
  const unsigned int input_row_stride,
  const unsigned int input_col_stride,
  float *output,
  const unsigned int output_row_stride,
  const unsigned int output_col_stride
)
{
  __asm __volatile(
    // Prologue: derive row pointers and column/prefetch byte offsets.
    //   input rows   : %[inptr0], x25, x22, x13, x24
    //   input columns: 0, %[input_col_stride1], x16, x26, x9
    //   prefetch offs: #64, x17, x23, x20, x15  (column offset + 64)
    //   output rows  : %[outptr0], x21, x14
    //   output cols  : 0, %[output_col_stride1], x19
    "add x25, %[inptr0], %[input_row_stride]\n"
    "add x16, %[input_col_stride1], %[input_col_stride1]\n"
    "add x21, %[outptr0], %[output_row_stride]\n"
    "add x22, x25, %[input_row_stride]\n"
    "add x23, x16, #64\n"
    "add x26, x16, %[input_col_stride1]\n"
    "add x13, x22, %[input_row_stride]\n"
    "add x20, x26, #64\n"
    "add x9, x26, %[input_col_stride1]\n"
    "add x24, x13, %[input_row_stride]\n"
    "add x15, x9, #64\n"
    "add x14, x21, %[output_row_stride]\n"
    "add x19, %[output_col_stride1], %[output_col_stride1]\n"
    // x17 is the prefetch offset for the second input column. It was
    // previously read by the prfm instructions without ever being written,
    // so those prefetches targeted an indeterminate address.
    "add x17, %[input_col_stride1], #64\n"
    // x27 = leftover channels (0..3), x28 = number of 4-channel groups.
    "and x27, %[n_channels], #3\n"
    "lsr x28, %[n_channels], #2\n"
    "cbz x28, 4f\n"
    // 1: vector path preamble - load parameters, seed the nine accumulators
    //    from v20 and start the first multiply-accumulates.
    "1:\n"
    "ldr q20, [%[wbptr]]\n"
    "subs x28, x28, #1\n"
    "mov v4.16b, v20.16b\n"
    "ldr q15, [%[wbptr], #16]\n"
    "mov v1.16b, v20.16b\n"
    "ldr q0, [%[wbptr], #32]\n"
    "mov v3.16b, v20.16b\n"
    "ldr q13, [%[wbptr], #48]\n"
    "mov v7.16b, v20.16b\n"
    "ldr q16, [%[wbptr], #64]\n"
    "mov v9.16b, v20.16b\n"
    "ldr q12, [%[wbptr], #80]\n"
    "mov v2.16b, v20.16b\n"
    "ldr q17, [%[wbptr], #96]\n"
    "mov v6.16b, v20.16b\n"
    "ldr q11, [%[wbptr], #112]\n"
    "mov v8.16b, v20.16b\n"
    "ldr q10, [%[wbptr], #128]\n"
    "mov v5.16b, v20.16b\n"
    "ldr q14, [%[wbptr], #144]\n"
    "ldr q27, [%[inptr0]]\n"
    "ldr q24, [x25]\n"
    "fmla v4.4s, v27.4s, v15.4s\n"
    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
    "ldr q21, [x22]\n"
    "ldr q19, [x25, %[input_col_stride1]]\n"
    "ldr q31, [%[inptr0], x16]\n"
    "ldr q28, [x13]\n"
    "fmla v4.4s, v24.4s, v16.4s\n"
    "ldr q18, [x22, %[input_col_stride1]]\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x25, #64]\n"
    "prfm pldl1keep, [%[inptr0], x17]\n"
    "prfm pldl1keep, [x22, #64]\n"
    "prfm pldl1keep, [x25, x17]\n"
    "prfm pldl1keep, [%[inptr0], x23]\n"
    "prfm pldl1keep, [x13, #64]\n"
    "prfm pldl1keep, [x22, x17]\n"
    // Flags still hold the result of the subs above: skip the steady-state
    // loop when only one 4-channel group is left.
    "beq 3f\n"
    // 2: steady-state vector loop - finishes the current group while loading
    //    parameters and first inputs of the next one.
    "2:\n"
    "fmla v1.4s, v24.4s, v15.4s\n"
    "ldr q24, [x25, x16]\n"
    "fmla v4.4s, v22.4s, v0.4s\n"
    "ldr q29, [%[inptr0], x26]\n"
    "fmla v3.4s, v22.4s, v15.4s\n"
    "ldr q30, [x24]\n"
    "fmla v1.4s, v21.4s, v16.4s\n"
    "ldr q25, [x13, %[input_col_stride1]]\n"
    "fmla v4.4s, v21.4s, v11.4s\n"
    "prfm pldl1keep, [x25, x23]\n"
    "fmla v7.4s, v21.4s, v15.4s\n"
    "ldr q26, [x22, x16]\n"
    "fmla v1.4s, v19.4s, v0.4s\n"
    "prfm pldl1keep, [%[inptr0], x20]\n"
    "fmla v4.4s, v19.4s, v12.4s\n"
    "prfm pldl1keep, [x24, #64]\n"
    "fmla v3.4s, v19.4s, v16.4s\n"
    "prfm pldl1keep, [x13, x17]\n"
    "fmla v9.4s, v19.4s, v15.4s\n"
    "ldr q23, [x25, x26]\n"
    "fmla v4.4s, v31.4s, v13.4s\n"
    "prfm pldl1keep, [x22, x23]\n"
    "fmla v3.4s, v31.4s, v0.4s\n"
    "prfm pldl1keep, [x25, x20]\n"
    "fmla v2.4s, v31.4s, v15.4s\n"
    "ldr q20, [%[inptr0], x9]\n"
    "fmla v1.4s, v28.4s, v11.4s\n"
    "prfm pldl1keep, [%[inptr0], x15]\n"
    "fmla v7.4s, v28.4s, v16.4s\n"
    "ldr q28, [x24, %[input_col_stride1]]\n"
    "fmla v4.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x17]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "prfm pldl1keep, [x13, x23]\n"
    "fmla v3.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x22, x20]\n"
    "fmla v7.4s, v18.4s, v0.4s\n"
    "prfm pldl1keep, [x25, x15]\n"
    "fmla v9.4s, v18.4s, v16.4s\n"
    "prfm pldl1keep, [x24, x23]\n"
    "fmla v6.4s, v18.4s, v15.4s\n"
    "ldr q27, [x13, x16]\n"
    "fmla v4.4s, v24.4s, v17.4s\n"
    "prfm pldl1keep, [x13, x20]\n"
    "fmla v1.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [x22, x15]\n"
    "fmla v3.4s, v24.4s, v12.4s\n"
    "prfm pldl1keep, [x24, x20]\n"
    "fmla v9.4s, v24.4s, v0.4s\n"
    "prfm pldl1keep, [x13, x15]\n"
    "fmla v2.4s, v24.4s, v16.4s\n"
    "prfm pldl1keep, [x24, x15]\n"
    "fmla v8.4s, v24.4s, v15.4s\n"
    "ldr q24, [x22, x26]\n"
    "fmla v3.4s, v29.4s, v13.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v2.4s, v29.4s, v0.4s\n"
    "ldr q22, [x25, x9]\n"
    "fmla v7.4s, v30.4s, v11.4s\n"
    "ldr q21, [x24, x16]\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v9.4s, v25.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v7.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v6.4s, v25.4s, v16.4s\n"
    "ldr q19, [x13, x26]\n"
    "fmla v4.4s, v26.4s, v14.4s\n"
    "prfm pldl1keep, [%[inptr0], x17]\n"
    "fmla v1.4s, v26.4s, v17.4s\n"
    "prfm pldl1keep, [%[inptr0], x23]\n"
    "fmla v3.4s, v26.4s, v10.4s\n"
    "add x25, x25, #16\n"
    "fmla v7.4s, v26.4s, v13.4s\n"
    "prfm pldl1keep, [x25, #64]\n"
    "fmla v9.4s, v26.4s, v12.4s\n"
    "prfm pldl1keep, [x25, x17]\n"
    "fmla v2.4s, v26.4s, v11.4s\n"
    "subs x28, x28, #1\n"
    "fmla v6.4s, v26.4s, v0.4s\n"
    "fmla v8.4s, v26.4s, v16.4s\n"
    "fmla v5.4s, v26.4s, v15.4s\n"
    "ldr q26, [x22, x9]\n"
    "fmla v3.4s, v23.4s, v17.4s\n"
    "ldr q18, [x24, x26]\n"
    "fmla v9.4s, v23.4s, v13.4s\n"
    "add x22, x22, #16\n"
    "fmla v2.4s, v23.4s, v12.4s\n"
    "prfm pldl1keep, [x22, #64]\n"
    "fmla v8.4s, v23.4s, v0.4s\n"
    "ldr q23, [x13, x9]\n"
    "fmla v7.4s, v28.4s, v10.4s\n"
    "prfm pldl1keep, [x22, x17]\n"
    "fmla v2.4s, v20.4s, v13.4s\n"
    "ldr q25, [x24, x9]\n"
    "fmla v6.4s, v28.4s, v11.4s\n"
    "ldr q20, [%[wbptr]]\n"
    "fmla v1.4s, v27.4s, v14.4s\n"
    "add x13, x13, #16\n"
    "fmla v7.4s, v27.4s, v17.4s\n"
    "prfm pldl1keep, [x13, #64]\n"
    "fmla v9.4s, v27.4s, v10.4s\n"
    "add x24, x24, #16\n"
    "fmla v6.4s, v27.4s, v12.4s\n"
    "fmla v8.4s, v27.4s, v11.4s\n"
    "fmla v5.4s, v27.4s, v16.4s\n"
    "ldr q15, [%[wbptr], #16]\n"
    "fmla v3.4s, v24.4s, v14.4s\n"
    "ldr q27, [%[inptr0]]\n"
    "fmla v9.4s, v24.4s, v17.4s\n"
    "fmla v2.4s, v24.4s, v10.4s\n"
    "fmla v6.4s, v24.4s, v13.4s\n"
    "fmla v8.4s, v24.4s, v12.4s\n"
    "fmla v5.4s, v24.4s, v0.4s\n"
    "ldr q16, [%[wbptr], #64]\n"
    "fmla v2.4s, v22.4s, v17.4s\n"
    "ldr q24, [x25]\n"
    "fmla v8.4s, v22.4s, v13.4s\n"
    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v7.4s, v21.4s, v14.4s\n"
    "fmla v6.4s, v21.4s, v10.4s\n"
    "fmla v5.4s, v21.4s, v11.4s\n"
    "ldr q0, [%[wbptr], #32]\n"
    "fmla v9.4s, v19.4s, v14.4s\n"
    "ldr q21, [x22]\n"
    "fmla v6.4s, v19.4s, v17.4s\n"
    "fmla v8.4s, v19.4s, v10.4s\n"
    "fmla v5.4s, v19.4s, v12.4s\n"
    "ldr q11, [%[wbptr], #112]\n"
    "fmla v2.4s, v26.4s, v14.4s\n"
    // v29 = 0.0f in every lane: the ReLU lower bound used by fmax below.
    "movi v29.16b, #0\n"
    "fmla v8.4s, v26.4s, v17.4s\n"
    "fmla v6.4s, v18.4s, v14.4s\n"
    "fmla v5.4s, v26.4s, v13.4s\n"
    "ldr q12, [%[wbptr], #80]\n"
    "fmax v4.4s, v4.4s, v29.4s\n"
    "ldr q19, [x25, %[input_col_stride1]]\n"
    "fmla v8.4s, v23.4s, v14.4s\n"
    "fmax v3.4s, v3.4s, v29.4s\n"
    "str q4, [%[outptr0]]\n"
    "fmla v5.4s, v18.4s, v10.4s\n"
    "str q3, [%[outptr0], %[output_col_stride1]]\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "ldr q13, [%[wbptr], #48]\n"
    "str q2, [%[outptr0], x19]\n"
    "fmla v5.4s, v23.4s, v17.4s\n"
    "str q1, [x21]\n"
    "fmax v9.4s, v9.4s, v29.4s\n"
    "fmax v8.4s, v8.4s, v29.4s\n"
    "ldr q10, [%[wbptr], #128]\n"
    "str q9, [x21, %[output_col_stride1]]\n"
    "fmla v5.4s, v25.4s, v14.4s\n"
    "str q8, [x21, x19]\n"
    "fmax v7.4s, v7.4s, v29.4s\n"
    "fmax v6.4s, v6.4s, v29.4s\n"
    "ldr q17, [%[wbptr], #96]\n"
    "str q7, [x14]\n"
    "fmax v5.4s, v5.4s, v29.4s\n"
    "str q6, [x14, %[output_col_stride1]]\n"
    "mov v4.16b, v20.16b\n"
    "str q5, [x14, x19]\n"
    "mov v1.16b, v20.16b\n"
    "mov v3.16b, v20.16b\n"
    "ldr q14, [%[wbptr], #144]\n"
    "mov v7.16b, v20.16b\n"
    "ldr q31, [%[inptr0], x16]\n"
    "mov v9.16b, v20.16b\n"
    "ldr q28, [x13]\n"
    "mov v2.16b, v20.16b\n"
    "ldr q18, [x22, %[input_col_stride1]]\n"
    "mov v6.16b, v20.16b\n"
    "add %[outptr0], %[outptr0], #16\n"
    "mov v8.16b, v20.16b\n"
    "add x21, x21, #16\n"
    "mov v5.16b, v20.16b\n"
    "add x14, x14, #16\n"
    "fmla v4.4s, v27.4s, v15.4s\n"
    "fmla v4.4s, v24.4s, v16.4s\n"
    "bne 2b\n"
    // 3: last 4-channel group - same arithmetic as the loop body, without
    //    loading parameters or inputs for a following group.
    "3:\n"
    "fmla v1.4s, v24.4s, v15.4s\n"
    "ldr q24, [x25, x16]\n"
    "fmla v4.4s, v22.4s, v0.4s\n"
    "ldr q29, [%[inptr0], x26]\n"
    "fmla v3.4s, v22.4s, v15.4s\n"
    "ldr q30, [x24]\n"
    "fmla v1.4s, v21.4s, v16.4s\n"
    "ldr q25, [x13, %[input_col_stride1]]\n"
    "fmla v4.4s, v21.4s, v11.4s\n"
    "prfm pldl1keep, [x25, x23]\n"
    "fmla v7.4s, v21.4s, v15.4s\n"
    "ldr q26, [x22, x16]\n"
    "fmla v1.4s, v19.4s, v0.4s\n"
    "prfm pldl1keep, [%[inptr0], x20]\n"
    "fmla v4.4s, v19.4s, v12.4s\n"
    "prfm pldl1keep, [x24, #64]\n"
    "fmla v3.4s, v19.4s, v16.4s\n"
    "prfm pldl1keep, [x13, x17]\n"
    "fmla v9.4s, v19.4s, v15.4s\n"
    "ldr q23, [x25, x26]\n"
    "fmla v4.4s, v31.4s, v13.4s\n"
    "prfm pldl1keep, [x22, x23]\n"
    "fmla v3.4s, v31.4s, v0.4s\n"
    "prfm pldl1keep, [x25, x20]\n"
    "fmla v2.4s, v31.4s, v15.4s\n"
    "ldr q20, [%[inptr0], x9]\n"
    "fmla v1.4s, v28.4s, v11.4s\n"
    "prfm pldl1keep, [%[inptr0], x15]\n"
    "fmla v7.4s, v28.4s, v16.4s\n"
    "ldr q28, [x24, %[input_col_stride1]]\n"
    "fmla v4.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x17]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "prfm pldl1keep, [x13, x23]\n"
    "fmla v3.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x22, x20]\n"
    "fmla v7.4s, v18.4s, v0.4s\n"
    "prfm pldl1keep, [x25, x15]\n"
    "fmla v9.4s, v18.4s, v16.4s\n"
    "prfm pldl1keep, [x24, x23]\n"
    "fmla v6.4s, v18.4s, v15.4s\n"
    "ldr q27, [x13, x16]\n"
    "fmla v4.4s, v24.4s, v17.4s\n"
    "prfm pldl1keep, [x13, x20]\n"
    "fmla v1.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [x22, x15]\n"
    "fmla v3.4s, v24.4s, v12.4s\n"
    "prfm pldl1keep, [x24, x20]\n"
    "fmla v9.4s, v24.4s, v0.4s\n"
    "prfm pldl1keep, [x13, x15]\n"
    "fmla v2.4s, v24.4s, v16.4s\n"
    "prfm pldl1keep, [x24, x15]\n"
    "fmla v8.4s, v24.4s, v15.4s\n"
    "ldr q24, [x22, x26]\n"
    "fmla v3.4s, v29.4s, v13.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v2.4s, v29.4s, v0.4s\n"
    "ldr q22, [x25, x9]\n"
    "fmla v7.4s, v30.4s, v11.4s\n"
    "ldr q21, [x24, x16]\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v9.4s, v25.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v7.4s, v25.4s, v12.4s\n"
    "add x25, x25, #16\n"
    "fmla v6.4s, v25.4s, v16.4s\n"
    "ldr q19, [x13, x26]\n"
    "fmla v4.4s, v26.4s, v14.4s\n"
    "fmla v1.4s, v26.4s, v17.4s\n"
    "fmla v3.4s, v26.4s, v10.4s\n"
    "fmla v7.4s, v26.4s, v13.4s\n"
    "fmla v9.4s, v26.4s, v12.4s\n"
    "fmla v2.4s, v26.4s, v11.4s\n"
    "fmla v6.4s, v26.4s, v0.4s\n"
    "fmla v8.4s, v26.4s, v16.4s\n"
    "fmla v5.4s, v26.4s, v15.4s\n"
    "ldr q26, [x22, x9]\n"
    "fmla v3.4s, v23.4s, v17.4s\n"
    "ldr q18, [x24, x26]\n"
    "fmla v9.4s, v23.4s, v13.4s\n"
    "add x22, x22, #16\n"
    "fmla v2.4s, v23.4s, v12.4s\n"
    "fmla v8.4s, v23.4s, v0.4s\n"
    "fmla v7.4s, v28.4s, v10.4s\n"
    "ldr q23, [x13, x9]\n"
    "fmla v6.4s, v28.4s, v11.4s\n"
    "ldr q25, [x24, x9]\n"
    "fmla v2.4s, v20.4s, v13.4s\n"
    "add x13, x13, #16\n"
    "fmla v1.4s, v27.4s, v14.4s\n"
    "add x24, x24, #16\n"
    "fmla v7.4s, v27.4s, v17.4s\n"
    "fmla v9.4s, v27.4s, v10.4s\n"
    "fmla v6.4s, v27.4s, v12.4s\n"
    "fmla v8.4s, v27.4s, v11.4s\n"
    "fmla v5.4s, v27.4s, v16.4s\n"
    "fmla v3.4s, v24.4s, v14.4s\n"
    "fmla v9.4s, v24.4s, v17.4s\n"
    "fmla v2.4s, v24.4s, v10.4s\n"
    "fmla v6.4s, v24.4s, v13.4s\n"
    "fmla v8.4s, v24.4s, v12.4s\n"
    "fmla v5.4s, v24.4s, v0.4s\n"
    "fmla v7.4s, v21.4s, v14.4s\n"
    "fmla v2.4s, v22.4s, v17.4s\n"
    "fmla v9.4s, v19.4s, v14.4s\n"
    "fmla v8.4s, v22.4s, v13.4s\n"
    "fmla v6.4s, v21.4s, v10.4s\n"
    "fmla v5.4s, v21.4s, v11.4s\n"
    "movi v29.16b, #0\n"
    "fmla v2.4s, v26.4s, v14.4s\n"
    "fmla v6.4s, v19.4s, v17.4s\n"
    "fmla v8.4s, v19.4s, v10.4s\n"
    "fmla v5.4s, v19.4s, v12.4s\n"
    "fmax v4.4s, v4.4s, v29.4s\n"
    "fmax v3.4s, v3.4s, v29.4s\n"
    "fmla v6.4s, v18.4s, v14.4s\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "str q4, [%[outptr0]]\n"
    "fmla v8.4s, v26.4s, v17.4s\n"
    "str q3, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v5.4s, v26.4s, v13.4s\n"
    "str q2, [%[outptr0], x19]\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "fmla v8.4s, v23.4s, v14.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "str q1, [x21]\n"
    "fmla v5.4s, v18.4s, v10.4s\n"
    "fmax v9.4s, v9.4s, v29.4s\n"
    "fmax v7.4s, v7.4s, v29.4s\n"
    "fmax v8.4s, v8.4s, v29.4s\n"
    "fmax v6.4s, v6.4s, v29.4s\n"
    "str q9, [x21, %[output_col_stride1]]\n"
    "fmla v5.4s, v23.4s, v17.4s\n"
    "str q8, [x21, x19]\n"
    "str q7, [x14]\n"
    "str q6, [x14, %[output_col_stride1]]\n"
    "add x21, x21, #16\n"
    "fmla v5.4s, v25.4s, v14.4s\n"
    "fmax v5.4s, v5.4s, v29.4s\n"
    "str q5, [x14, x19]\n"
    "add x14, x14, #16\n"
    // 4: scalar tail - the remaining (n_channels & 3) channels, one float at
    //    a time. Mirrors labels 1/2/3 with s-register loads/stores, 4-byte
    //    pointer steps and a 40-byte parameter step.
    "4:\n"
    "cbz x27, 7f\n"
    "ldr s20, [%[wbptr]]\n"
    "mov v4.16b, v20.16b\n"
    "ldr s15, [%[wbptr], #4]\n"
    "mov v1.16b, v20.16b\n"
    "ldr s0, [%[wbptr], #8]\n"
    "mov v3.16b, v20.16b\n"
    "ldr s13, [%[wbptr], #12]\n"
    "mov v7.16b, v20.16b\n"
    "ldr s16, [%[wbptr], #16]\n"
    "mov v9.16b, v20.16b\n"
    "ldr s12, [%[wbptr], #20]\n"
    "mov v2.16b, v20.16b\n"
    "ldr s17, [%[wbptr], #24]\n"
    "mov v6.16b, v20.16b\n"
    "ldr s11, [%[wbptr], #28]\n"
    "mov v8.16b, v20.16b\n"
    "ldr s10, [%[wbptr], #32]\n"
    "mov v5.16b, v20.16b\n"
    "ldr s14, [%[wbptr], #36]\n"
    "ldr s27, [%[inptr0]]\n"
    "subs x27, x27, #1\n"
    "fmla v4.4s, v27.4s, v15.4s\n"
    "ldr s24, [x25]\n"
    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
    "ldr s21, [x22]\n"
    "ldr s19, [x25, %[input_col_stride1]]\n"
    "ldr s31, [%[inptr0], x16]\n"
    "fmla v4.4s, v24.4s, v16.4s\n"
    "ldr s28, [x13]\n"
    "ldr s18, [x22, %[input_col_stride1]]\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x25, #64]\n"
    "prfm pldl1keep, [%[inptr0], x17]\n"
    "prfm pldl1keep, [x22, #64]\n"
    "prfm pldl1keep, [x25, x17]\n"
    "prfm pldl1keep, [%[inptr0], x23]\n"
    "prfm pldl1keep, [x13, #64]\n"
    "prfm pldl1keep, [x22, x17]\n"
    "beq 6f\n"
    // 5: steady-state scalar loop.
    "5:\n"
    "fmla v1.4s, v24.4s, v15.4s\n"
    "ldr s24, [x25, x16]\n"
    "fmla v4.4s, v22.4s, v0.4s\n"
    "ldr s29, [%[inptr0], x26]\n"
    "fmla v3.4s, v22.4s, v15.4s\n"
    "ldr s30, [x24]\n"
    "fmla v1.4s, v21.4s, v16.4s\n"
    "ldr s25, [x13, %[input_col_stride1]]\n"
    "fmla v4.4s, v21.4s, v11.4s\n"
    "prfm pldl1keep, [x25, x23]\n"
    "fmla v7.4s, v21.4s, v15.4s\n"
    "ldr s26, [x22, x16]\n"
    "fmla v1.4s, v19.4s, v0.4s\n"
    "prfm pldl1keep, [%[inptr0], x20]\n"
    "fmla v4.4s, v19.4s, v12.4s\n"
    "prfm pldl1keep, [x24, #64]\n"
    "fmla v3.4s, v19.4s, v16.4s\n"
    "prfm pldl1keep, [x13, x17]\n"
    "fmla v9.4s, v19.4s, v15.4s\n"
    "ldr s23, [x25, x26]\n"
    "fmla v4.4s, v31.4s, v13.4s\n"
    "prfm pldl1keep, [x22, x23]\n"
    "fmla v3.4s, v31.4s, v0.4s\n"
    "prfm pldl1keep, [x25, x20]\n"
    "fmla v2.4s, v31.4s, v15.4s\n"
    "ldr s20, [%[inptr0], x9]\n"
    "fmla v1.4s, v28.4s, v11.4s\n"
    "prfm pldl1keep, [%[inptr0], x15]\n"
    "fmla v7.4s, v28.4s, v16.4s\n"
    "ldr s28, [x24, %[input_col_stride1]]\n"
    "fmla v4.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x17]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "prfm pldl1keep, [x13, x23]\n"
    "fmla v3.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x22, x20]\n"
    "fmla v7.4s, v18.4s, v0.4s\n"
    "prfm pldl1keep, [x25, x15]\n"
    "fmla v9.4s, v18.4s, v16.4s\n"
    "prfm pldl1keep, [x24, x23]\n"
    "fmla v6.4s, v18.4s, v15.4s\n"
    "ldr s27, [x13, x16]\n"
    "fmla v4.4s, v24.4s, v17.4s\n"
    "prfm pldl1keep, [x13, x20]\n"
    "fmla v1.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [x22, x15]\n"
    "fmla v3.4s, v24.4s, v12.4s\n"
    "prfm pldl1keep, [x24, x20]\n"
    "fmla v9.4s, v24.4s, v0.4s\n"
    "prfm pldl1keep, [x13, x15]\n"
    "fmla v2.4s, v24.4s, v16.4s\n"
    "prfm pldl1keep, [x24, x15]\n"
    "fmla v8.4s, v24.4s, v15.4s\n"
    "ldr s24, [x22, x26]\n"
    "fmla v3.4s, v29.4s, v13.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v2.4s, v29.4s, v0.4s\n"
    "ldr s22, [x25, x9]\n"
    "fmla v7.4s, v30.4s, v11.4s\n"
    "ldr s21, [x24, x16]\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v9.4s, v25.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v7.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v6.4s, v25.4s, v16.4s\n"
    "ldr s19, [x13, x26]\n"
    "fmla v4.4s, v26.4s, v14.4s\n"
    "prfm pldl1keep, [%[inptr0], x17]\n"
    "fmla v1.4s, v26.4s, v17.4s\n"
    "prfm pldl1keep, [%[inptr0], x23]\n"
    "fmla v3.4s, v26.4s, v10.4s\n"
    "add x25, x25, #4\n"
    "fmla v7.4s, v26.4s, v13.4s\n"
    "prfm pldl1keep, [x25, #64]\n"
    "fmla v9.4s, v26.4s, v12.4s\n"
    "prfm pldl1keep, [x25, x17]\n"
    "fmla v2.4s, v26.4s, v11.4s\n"
    "subs x27, x27, #1\n"
    "fmla v6.4s, v26.4s, v0.4s\n"
    "fmla v8.4s, v26.4s, v16.4s\n"
    "fmla v5.4s, v26.4s, v15.4s\n"
    "ldr s26, [x22, x9]\n"
    "fmla v3.4s, v23.4s, v17.4s\n"
    "ldr s18, [x24, x26]\n"
    "fmla v9.4s, v23.4s, v13.4s\n"
    "add x22, x22, #4\n"
    "fmla v2.4s, v23.4s, v12.4s\n"
    "prfm pldl1keep, [x22, #64]\n"
    "fmla v8.4s, v23.4s, v0.4s\n"
    "ldr s23, [x13, x9]\n"
    "fmla v7.4s, v28.4s, v10.4s\n"
    "prfm pldl1keep, [x22, x17]\n"
    "fmla v2.4s, v20.4s, v13.4s\n"
    "ldr s25, [x24, x9]\n"
    "fmla v6.4s, v28.4s, v11.4s\n"
    "ldr s20, [%[wbptr]]\n"
    "fmla v1.4s, v27.4s, v14.4s\n"
    "add x13, x13, #4\n"
    "fmla v7.4s, v27.4s, v17.4s\n"
    "prfm pldl1keep, [x13, #64]\n"
    "fmla v9.4s, v27.4s, v10.4s\n"
    "add x24, x24, #4\n"
    "fmla v6.4s, v27.4s, v12.4s\n"
    "fmla v8.4s, v27.4s, v11.4s\n"
    "fmla v5.4s, v27.4s, v16.4s\n"
    "ldr s15, [%[wbptr], #4]\n"
    "fmla v3.4s, v24.4s, v14.4s\n"
    "ldr s27, [%[inptr0]]\n"
    "fmla v9.4s, v24.4s, v17.4s\n"
    "fmla v2.4s, v24.4s, v10.4s\n"
    "fmla v6.4s, v24.4s, v13.4s\n"
    "fmla v8.4s, v24.4s, v12.4s\n"
    "fmla v5.4s, v24.4s, v0.4s\n"
    "ldr s16, [%[wbptr], #16]\n"
    "fmla v2.4s, v22.4s, v17.4s\n"
    "ldr s24, [x25]\n"
    "fmla v8.4s, v22.4s, v13.4s\n"
    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v7.4s, v21.4s, v14.4s\n"
    "fmla v6.4s, v21.4s, v10.4s\n"
    "fmla v5.4s, v21.4s, v11.4s\n"
    "ldr s0, [%[wbptr], #8]\n"
    "fmla v9.4s, v19.4s, v14.4s\n"
    "ldr s21, [x22]\n"
    "fmla v6.4s, v19.4s, v17.4s\n"
    "fmla v8.4s, v19.4s, v10.4s\n"
    "fmla v5.4s, v19.4s, v12.4s\n"
    "ldr s11, [%[wbptr], #28]\n"
    "fmla v2.4s, v26.4s, v14.4s\n"
    "movi v29.16b, #0\n"
    "fmla v8.4s, v26.4s, v17.4s\n"
    "fmla v6.4s, v18.4s, v14.4s\n"
    "fmla v5.4s, v26.4s, v13.4s\n"
    "ldr s12, [%[wbptr], #20]\n"
    "fmax v4.4s, v4.4s, v29.4s\n"
    "ldr s19, [x25, %[input_col_stride1]]\n"
    "fmla v8.4s, v23.4s, v14.4s\n"
    "fmax v3.4s, v3.4s, v29.4s\n"
    "str s4, [%[outptr0]]\n"
    "fmla v5.4s, v18.4s, v10.4s\n"
    "str s3, [%[outptr0], %[output_col_stride1]]\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "ldr s13, [%[wbptr], #12]\n"
    "str s2, [%[outptr0], x19]\n"
    "fmla v5.4s, v23.4s, v17.4s\n"
    "str s1, [x21]\n"
    "fmax v9.4s, v9.4s, v29.4s\n"
    "fmax v8.4s, v8.4s, v29.4s\n"
    "ldr s10, [%[wbptr], #32]\n"
    "str s9, [x21, %[output_col_stride1]]\n"
    "fmla v5.4s, v25.4s, v14.4s\n"
    "str s8, [x21, x19]\n"
    "fmax v7.4s, v7.4s, v29.4s\n"
    "fmax v6.4s, v6.4s, v29.4s\n"
    "ldr s17, [%[wbptr], #24]\n"
    "str s7, [x14]\n"
    "fmax v5.4s, v5.4s, v29.4s\n"
    "str s6, [x14, %[output_col_stride1]]\n"
    "mov v4.16b, v20.16b\n"
    "str s5, [x14, x19]\n"
    "mov v1.16b, v20.16b\n"
    "mov v3.16b, v20.16b\n"
    "ldr s14, [%[wbptr], #36]\n"
    "mov v7.16b, v20.16b\n"
    "ldr s31, [%[inptr0], x16]\n"
    "mov v9.16b, v20.16b\n"
    "ldr s28, [x13]\n"
    "mov v2.16b, v20.16b\n"
    "ldr s18, [x22, %[input_col_stride1]]\n"
    "mov v6.16b, v20.16b\n"
    "add %[outptr0], %[outptr0], #4\n"
    "mov v8.16b, v20.16b\n"
    "add x21, x21, #4\n"
    "mov v5.16b, v20.16b\n"
    "add x14, x14, #4\n"
    "fmla v4.4s, v27.4s, v15.4s\n"
    "fmla v4.4s, v24.4s, v16.4s\n"
    "bne 5b\n"
    // 6: last scalar channel.
    "6:\n"
    "fmla v1.4s, v24.4s, v15.4s\n"
    "ldr s24, [x25, x16]\n"
    "fmla v4.4s, v22.4s, v0.4s\n"
    "ldr s29, [%[inptr0], x26]\n"
    "fmla v3.4s, v22.4s, v15.4s\n"
    "ldr s30, [x24]\n"
    "fmla v1.4s, v21.4s, v16.4s\n"
    "ldr s25, [x13, %[input_col_stride1]]\n"
    "fmla v4.4s, v21.4s, v11.4s\n"
    "prfm pldl1keep, [x25, x23]\n"
    "fmla v7.4s, v21.4s, v15.4s\n"
    "ldr s26, [x22, x16]\n"
    "fmla v1.4s, v19.4s, v0.4s\n"
    "prfm pldl1keep, [%[inptr0], x20]\n"
    "fmla v4.4s, v19.4s, v12.4s\n"
    "prfm pldl1keep, [x24, #64]\n"
    "fmla v3.4s, v19.4s, v16.4s\n"
    "prfm pldl1keep, [x13, x17]\n"
    "fmla v9.4s, v19.4s, v15.4s\n"
    "ldr s23, [x25, x26]\n"
    "fmla v4.4s, v31.4s, v13.4s\n"
    "prfm pldl1keep, [x22, x23]\n"
    "fmla v3.4s, v31.4s, v0.4s\n"
    "prfm pldl1keep, [x25, x20]\n"
    "fmla v2.4s, v31.4s, v15.4s\n"
    "ldr s20, [%[inptr0], x9]\n"
    "fmla v1.4s, v28.4s, v11.4s\n"
    "prfm pldl1keep, [%[inptr0], x15]\n"
    "fmla v7.4s, v28.4s, v16.4s\n"
    "ldr s28, [x24, %[input_col_stride1]]\n"
    "fmla v4.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x17]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "prfm pldl1keep, [x13, x23]\n"
    "fmla v3.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x22, x20]\n"
    "fmla v7.4s, v18.4s, v0.4s\n"
    "prfm pldl1keep, [x25, x15]\n"
    "fmla v9.4s, v18.4s, v16.4s\n"
    "prfm pldl1keep, [x24, x23]\n"
    "fmla v6.4s, v18.4s, v15.4s\n"
    "ldr s27, [x13, x16]\n"
    "fmla v4.4s, v24.4s, v17.4s\n"
    "prfm pldl1keep, [x13, x20]\n"
    "fmla v1.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [x22, x15]\n"
    "fmla v3.4s, v24.4s, v12.4s\n"
    "prfm pldl1keep, [x24, x20]\n"
    "fmla v9.4s, v24.4s, v0.4s\n"
    "prfm pldl1keep, [x13, x15]\n"
    "fmla v2.4s, v24.4s, v16.4s\n"
    "prfm pldl1keep, [x24, x15]\n"
    "fmla v8.4s, v24.4s, v15.4s\n"
    "ldr s24, [x22, x26]\n"
    "fmla v3.4s, v29.4s, v13.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v2.4s, v29.4s, v0.4s\n"
    "ldr s22, [x25, x9]\n"
    "fmla v7.4s, v30.4s, v11.4s\n"
    "ldr s21, [x24, x16]\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v9.4s, v25.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v7.4s, v25.4s, v12.4s\n"
    "add x25, x25, #4\n"
    "fmla v6.4s, v25.4s, v16.4s\n"
    "ldr s19, [x13, x26]\n"
    "fmla v4.4s, v26.4s, v14.4s\n"
    "fmla v1.4s, v26.4s, v17.4s\n"
    "fmla v3.4s, v26.4s, v10.4s\n"
    "fmla v7.4s, v26.4s, v13.4s\n"
    "fmla v9.4s, v26.4s, v12.4s\n"
    "fmla v2.4s, v26.4s, v11.4s\n"
    "fmla v6.4s, v26.4s, v0.4s\n"
    "fmla v8.4s, v26.4s, v16.4s\n"
    "fmla v5.4s, v26.4s, v15.4s\n"
    "ldr s26, [x22, x9]\n"
    "fmla v3.4s, v23.4s, v17.4s\n"
    "ldr s18, [x24, x26]\n"
    "fmla v9.4s, v23.4s, v13.4s\n"
    "add x22, x22, #4\n"
    "fmla v2.4s, v23.4s, v12.4s\n"
    "fmla v8.4s, v23.4s, v0.4s\n"
    "fmla v7.4s, v28.4s, v10.4s\n"
    "ldr s23, [x13, x9]\n"
    "fmla v6.4s, v28.4s, v11.4s\n"
    "ldr s25, [x24, x9]\n"
    "fmla v2.4s, v20.4s, v13.4s\n"
    "add x13, x13, #4\n"
    "fmla v1.4s, v27.4s, v14.4s\n"
    "add x24, x24, #4\n"
    "fmla v7.4s, v27.4s, v17.4s\n"
    "fmla v9.4s, v27.4s, v10.4s\n"
    "fmla v6.4s, v27.4s, v12.4s\n"
    "fmla v8.4s, v27.4s, v11.4s\n"
    "fmla v5.4s, v27.4s, v16.4s\n"
    "fmla v3.4s, v24.4s, v14.4s\n"
    "fmla v9.4s, v24.4s, v17.4s\n"
    "fmla v2.4s, v24.4s, v10.4s\n"
    "fmla v6.4s, v24.4s, v13.4s\n"
    "fmla v8.4s, v24.4s, v12.4s\n"
    "fmla v5.4s, v24.4s, v0.4s\n"
    "fmla v7.4s, v21.4s, v14.4s\n"
    "fmla v2.4s, v22.4s, v17.4s\n"
    "fmla v9.4s, v19.4s, v14.4s\n"
    "fmla v8.4s, v22.4s, v13.4s\n"
    "fmla v6.4s, v21.4s, v10.4s\n"
    "fmla v5.4s, v21.4s, v11.4s\n"
    "movi v29.16b, #0\n"
    "fmla v2.4s, v26.4s, v14.4s\n"
    "fmla v6.4s, v19.4s, v17.4s\n"
    "fmla v8.4s, v19.4s, v10.4s\n"
    "fmla v5.4s, v19.4s, v12.4s\n"
    "fmax v4.4s, v4.4s, v29.4s\n"
    "fmax v3.4s, v3.4s, v29.4s\n"
    "fmla v6.4s, v18.4s, v14.4s\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "str s4, [%[outptr0]]\n"
    "fmla v8.4s, v26.4s, v17.4s\n"
    "str s3, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v5.4s, v26.4s, v13.4s\n"
    "str s2, [%[outptr0], x19]\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "fmla v8.4s, v23.4s, v14.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "str s1, [x21]\n"
    "fmla v5.4s, v18.4s, v10.4s\n"
    "fmax v9.4s, v9.4s, v29.4s\n"
    "fmax v7.4s, v7.4s, v29.4s\n"
    "fmax v8.4s, v8.4s, v29.4s\n"
    "fmax v6.4s, v6.4s, v29.4s\n"
    "str s9, [x21, %[output_col_stride1]]\n"
    "fmla v5.4s, v23.4s, v17.4s\n"
    "str s8, [x21, x19]\n"
    "str s7, [x14]\n"
    "str s6, [x14, %[output_col_stride1]]\n"
    "add x21, x21, #4\n"
    "fmla v5.4s, v25.4s, v14.4s\n"
    "fmax v5.4s, v5.4s, v29.4s\n"
    "str s5, [x14, x19]\n"
    "add x14, x14, #4\n"
    // 7: done.
    "7:\n"
    // Pointers are read-write operands because the asm advances them.
    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
    // Strides are converted from elements to bytes before entering the asm.
    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
  );
}

/**
 * AArch64 NEON kernel computing one 3x3 output tile of a 3x3, stride-1
 * depthwise convolution with a fused ReLU6 (every result is clamped to
 * [0, 6] with fmax/fmin before it is stored).
 *
 * Channels are processed four at a time with 128-bit loads/stores
 * (n_channels >> 2 iterations) and the remaining n_channels & 3 channels are
 * processed one at a time with 32-bit loads/stores.  Both paths are software
 * pipelined: a prologue loads the first operands, a steady-state loop
 * overlaps the loads for the next group with the current arithmetic, and an
 * epilogue finishes the last group.
 *
 * Data accessed per channel (group):
 *  - 5 input rows x 5 input columns, starting at `input`;
 *  - 3 output rows x 3 output columns, starting at `output`;
 *  - 10 values from `weight_bias_ptr`: the first initialises all nine
 *    accumulators (presumably the bias), the following nine are the kernel
 *    weights in row-major order.  The vector path consumes 160 bytes per
 *    group of four channels, the scalar path 40 bytes per channel.
 * All three pointers advance by one float per channel.
 *
 * @param[in]  n_channels        Number of channels to process.
 * @param[in]  weight_bias_ptr   Packed accumulator-initialiser and weights, laid out as described above.
 * @param[in]  input             Pointer to the top-left element of the 5x5 input patch.
 * @param[in]  input_row_stride  Stride between input rows, in elements (scaled by sizeof(float) below).
 * @param[in]  input_col_stride  Stride between input columns, in elements.
 * @param[out] output            Pointer to the top-left element of the 3x3 output tile.
 * @param[in]  output_row_stride Stride between output rows, in elements.
 * @param[in]  output_col_stride Stride between output columns, in elements.
 */
template <>
template <>
void Conv::execute_tile<ActivationFunction::ReLU6>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *input,
  const unsigned int input_row_stride,
  const unsigned int input_col_stride,
  float *output,
  const unsigned int output_row_stride,
  const unsigned int output_col_stride
)
{
  __asm __volatile(
    // Set-up.  Input row pointers: inptr0, x17, x14, x21, x26.
    // Input column byte offsets: 0, input_col_stride1, x9, x15, x24.
    // Prefetch offsets (column offset + 64 bytes): #64, x28, x22, x16, x23.
    // Output row pointers: outptr0, x25, x13; output column offsets: 0,
    // output_col_stride1, x27.
    "add x17, %[inptr0], %[input_row_stride]\n"
    "add x9, %[input_col_stride1], %[input_col_stride1]\n"
    // x28 is only consumed by the prfm hints; it has to be initialised here
    // because, as a clobbered register, it carries no defined value on entry.
    "add x28, %[input_col_stride1], #64\n"
    "add x25, %[outptr0], %[output_row_stride]\n"
    "add x14, x17, %[input_row_stride]\n"
    "add x22, x9, #64\n"
    "add x15, x9, %[input_col_stride1]\n"
    "add x21, x14, %[input_row_stride]\n"
    "add x16, x15, #64\n"
    "add x24, x15, %[input_col_stride1]\n"
    "add x26, x21, %[input_row_stride]\n"
    "add x23, x24, #64\n"
    "add x13, x25, %[output_row_stride]\n"
    "add x27, %[output_col_stride1], %[output_col_stride1]\n"
    // x19 = number of tail channels, x20 = number of groups of four channels.
    "and x19, %[n_channels], #3\n"
    "lsr x20, %[n_channels], #2\n"
    "cbz x20, 4f\n"
    // Vector prologue: load the accumulator initialiser (v19) and the nine
    // weights (v17..v9), seed the nine accumulators v8..v0 and load the first
    // inputs.  The flags set by the subs below are consumed by "beq 3f".
    "1:\n"
    "ldr q19, [%[wbptr]]\n"
    "subs x20, x20, #1\n"
    "mov v8.16b, v19.16b\n"
    "ldr q17, [%[wbptr], #16]\n"
    "mov v5.16b, v19.16b\n"
    "ldr q16, [%[wbptr], #32]\n"
    "mov v7.16b, v19.16b\n"
    "ldr q15, [%[wbptr], #48]\n"
    "mov v2.16b, v19.16b\n"
    "ldr q14, [%[wbptr], #64]\n"
    "mov v4.16b, v19.16b\n"
    "ldr q13, [%[wbptr], #80]\n"
    "mov v6.16b, v19.16b\n"
    "ldr q12, [%[wbptr], #96]\n"
    "mov v1.16b, v19.16b\n"
    "ldr q11, [%[wbptr], #112]\n"
    "mov v3.16b, v19.16b\n"
    "ldr q10, [%[wbptr], #128]\n"
    "mov v0.16b, v19.16b\n"
    "ldr q9, [%[wbptr], #144]\n"
    "ldr q25, [%[inptr0]]\n"
    "ldr q27, [x17]\n"
    "fmla v8.4s, v25.4s, v17.4s\n"
    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
    "ldr q20, [x14]\n"
    "ldr q22, [x17, %[input_col_stride1]]\n"
    "ldr q28, [%[inptr0], x9]\n"
    "ldr q23, [x21]\n"
    "fmla v8.4s, v27.4s, v14.4s\n"
    "ldr q18, [x14, %[input_col_stride1]]\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x17, #64]\n"
    "prfm pldl1keep, [%[inptr0], x28]\n"
    "prfm pldl1keep, [x14, #64]\n"
    "prfm pldl1keep, [x17, x28]\n"
    "prfm pldl1keep, [%[inptr0], x22]\n"
    "prfm pldl1keep, [x21, #64]\n"
    "prfm pldl1keep, [x14, x28]\n"
    "beq 3f\n"
    // Vector steady state: finish the current group of four channels while
    // loading weights and the first inputs of the next group.
    "2:\n"
    "fmla v5.4s, v27.4s, v17.4s\n"
    "ldr q27, [x17, x9]\n"
    "fmla v8.4s, v26.4s, v16.4s\n"
    "ldr q30, [%[inptr0], x15]\n"
    "fmla v7.4s, v26.4s, v17.4s\n"
    "ldr q31, [x26]\n"
    "fmla v5.4s, v20.4s, v14.4s\n"
    "ldr q24, [x21, %[input_col_stride1]]\n"
    "fmla v8.4s, v20.4s, v11.4s\n"
    "prfm pldl1keep, [x17, x22]\n"
    "fmla v2.4s, v20.4s, v17.4s\n"
    "ldr q29, [x14, x9]\n"
    "fmla v5.4s, v22.4s, v16.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v8.4s, v22.4s, v13.4s\n"
    "prfm pldl1keep, [x26, #64]\n"
    "fmla v7.4s, v22.4s, v14.4s\n"
    "prfm pldl1keep, [x21, x28]\n"
    "fmla v4.4s, v22.4s, v17.4s\n"
    "ldr q21, [x17, x15]\n"
    "fmla v8.4s, v28.4s, v15.4s\n"
    "prfm pldl1keep, [x14, x22]\n"
    "fmla v7.4s, v28.4s, v16.4s\n"
    "prfm pldl1keep, [x17, x16]\n"
    "fmla v6.4s, v28.4s, v17.4s\n"
    "ldr q19, [%[inptr0], x24]\n"
    "fmla v5.4s, v23.4s, v11.4s\n"
    "prfm pldl1keep, [%[inptr0], x23]\n"
    "fmla v2.4s, v23.4s, v14.4s\n"
    "ldr q28, [x26, %[input_col_stride1]]\n"
    "fmla v8.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x26, x28]\n"
    "fmla v5.4s, v18.4s, v13.4s\n"
    "prfm pldl1keep, [x21, x22]\n"
    "fmla v7.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x14, x16]\n"
    "fmla v2.4s, v18.4s, v16.4s\n"
    "prfm pldl1keep, [x17, x23]\n"
    "fmla v4.4s, v18.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x22]\n"
    "fmla v1.4s, v18.4s, v17.4s\n"
    "ldr q25, [x21, x9]\n"
    "fmla v8.4s, v27.4s, v12.4s\n"
    "prfm pldl1keep, [x21, x16]\n"
    "fmla v5.4s, v27.4s, v15.4s\n"
    "prfm pldl1keep, [x14, x23]\n"
    "fmla v7.4s, v27.4s, v13.4s\n"
    "prfm pldl1keep, [x26, x16]\n"
    "fmla v4.4s, v27.4s, v16.4s\n"
    "prfm pldl1keep, [x21, x23]\n"
    "fmla v6.4s, v27.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x23]\n"
    "fmla v3.4s, v27.4s, v17.4s\n"
    "ldr q27, [x14, x15]\n"
    "fmla v7.4s, v30.4s, v15.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v6.4s, v30.4s, v16.4s\n"
    "ldr q26, [x17, x24]\n"
    "fmla v2.4s, v31.4s, v11.4s\n"
    "ldr q20, [x26, x9]\n"
    "fmla v5.4s, v24.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v4.4s, v24.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v2.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v1.4s, v24.4s, v14.4s\n"
    "ldr q18, [x21, x15]\n"
    "fmla v8.4s, v29.4s, v9.4s\n"
    "prfm pldl1keep, [%[inptr0], x28]\n"
    "fmla v5.4s, v29.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], x22]\n"
    "fmla v7.4s, v29.4s, v10.4s\n"
    "add x17, x17, #16\n"
    "fmla v2.4s, v29.4s, v15.4s\n"
    "prfm pldl1keep, [x17, #64]\n"
    "fmla v4.4s, v29.4s, v13.4s\n"
    "prfm pldl1keep, [x17, x28]\n"
    "fmla v6.4s, v29.4s, v11.4s\n"
    // Loop counter update; the flags are consumed by "bne 2b" at the end of
    // the loop body (no instruction in between modifies them).
    "subs x20, x20, #1\n"
    "fmla v1.4s, v29.4s, v16.4s\n"
    "fmla v3.4s, v29.4s, v14.4s\n"
    "fmla v0.4s, v29.4s, v17.4s\n"
    "ldr q22, [x14, x24]\n"
    "fmla v7.4s, v21.4s, v12.4s\n"
    "ldr q23, [x26, x15]\n"
    "fmla v4.4s, v21.4s, v15.4s\n"
    "add x14, x14, #16\n"
    "fmla v6.4s, v21.4s, v13.4s\n"
    "prfm pldl1keep, [x14, #64]\n"
    "fmla v3.4s, v21.4s, v16.4s\n"
    "ldr q24, [x21, x24]\n"
    "fmla v2.4s, v28.4s, v10.4s\n"
    "prfm pldl1keep, [x14, x28]\n"
    "fmla v6.4s, v19.4s, v15.4s\n"
    "ldr q21, [x26, x24]\n"
    "fmla v1.4s, v28.4s, v11.4s\n"
    "ldr q19, [%[wbptr]]\n"
    "fmla v5.4s, v25.4s, v9.4s\n"
    "add x21, x21, #16\n"
    "fmla v2.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x21, #64]\n"
    "fmla v4.4s, v25.4s, v10.4s\n"
    "add x26, x26, #16\n"
    "fmla v1.4s, v25.4s, v13.4s\n"
    "fmla v3.4s, v25.4s, v11.4s\n"
    "fmla v0.4s, v25.4s, v14.4s\n"
    "ldr q17, [%[wbptr], #16]\n"
    "fmla v7.4s, v27.4s, v9.4s\n"
    "ldr q25, [%[inptr0]]\n"
    "fmla v4.4s, v27.4s, v12.4s\n"
    "fmla v6.4s, v27.4s, v10.4s\n"
    "fmla v1.4s, v27.4s, v15.4s\n"
    "fmla v3.4s, v27.4s, v13.4s\n"
    "fmla v0.4s, v27.4s, v16.4s\n"
    "ldr q14, [%[wbptr], #64]\n"
    "fmla v6.4s, v26.4s, v12.4s\n"
    "ldr q27, [x17]\n"
    "fmla v3.4s, v26.4s, v15.4s\n"
    "ldr q26, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v2.4s, v20.4s, v9.4s\n"
    "fmla v1.4s, v20.4s, v10.4s\n"
    "fmla v0.4s, v20.4s, v11.4s\n"
    "ldr q16, [%[wbptr], #32]\n"
    "fmla v4.4s, v18.4s, v9.4s\n"
    "ldr q20, [x14]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "fmla v3.4s, v18.4s, v10.4s\n"
    "fmla v0.4s, v18.4s, v13.4s\n"
    "ldr q11, [%[wbptr], #112]\n"
    "fmla v6.4s, v22.4s, v9.4s\n"
    // v30 = 0.0 and v29 = 6.0: lower and upper bounds of the ReLU6 clamp.
    "movi v30.16b, #0\n"
    "fmla v3.4s, v22.4s, v12.4s\n"
    "fmla v1.4s, v23.4s, v9.4s\n"
    "fmla v0.4s, v22.4s, v15.4s\n"
    "ldr q13, [%[wbptr], #80]\n"
    "fmov v29.4s, #6.0\n"
    "fmax v8.4s, v8.4s, v30.4s\n"
    "fmla v3.4s, v24.4s, v9.4s\n"
    "fmax v7.4s, v7.4s, v30.4s\n"
    "fmla v0.4s, v23.4s, v10.4s\n"
    "ldr q15, [%[wbptr], #48]\n"
    "fmin v8.4s, v8.4s, v29.4s\n"
    "ldr q22, [x17, %[input_col_stride1]]\n"
    "fmin v7.4s, v7.4s, v29.4s\n"
    "fmax v6.4s, v6.4s, v30.4s\n"
    "str q8, [%[outptr0]]\n"
    "fmla v0.4s, v24.4s, v12.4s\n"
    "str q7, [%[outptr0], %[output_col_stride1]]\n"
    "fmin v6.4s, v6.4s, v29.4s\n"
    "fmax v5.4s, v5.4s, v30.4s\n"
    "ldr q10, [%[wbptr], #128]\n"
    "str q6, [%[outptr0], x27]\n"
    "fmla v0.4s, v21.4s, v9.4s\n"
    "fmin v5.4s, v5.4s, v29.4s\n"
    "ldr q12, [%[wbptr], #96]\n"
    "fmax v4.4s, v4.4s, v30.4s\n"
    "ldr q28, [%[inptr0], x9]\n"
    "str q5, [x25]\n"
    "fmax v3.4s, v3.4s, v30.4s\n"
    "fmin v4.4s, v4.4s, v29.4s\n"
    "ldr q9, [%[wbptr], #144]\n"
    "fmin v3.4s, v3.4s, v29.4s\n"
    "ldr q23, [x21]\n"
    "str q4, [x25, %[output_col_stride1]]\n"
    "fmax v2.4s, v2.4s, v30.4s\n"
    "str q3, [x25, x27]\n"
    "fmax v1.4s, v1.4s, v30.4s\n"
    "fmin v2.4s, v2.4s, v29.4s\n"
    "ldr q18, [x14, %[input_col_stride1]]\n"
    "fmin v1.4s, v1.4s, v29.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "str q2, [x13]\n"
    "fmax v0.4s, v0.4s, v30.4s\n"
    "str q1, [x13, %[output_col_stride1]]\n"
    "mov v8.16b, v19.16b\n"
    "fmin v0.4s, v0.4s, v29.4s\n"
    "add x25, x25, #16\n"
    "mov v5.16b, v19.16b\n"
    "mov v7.16b, v19.16b\n"
    "str q0, [x13, x27]\n"
    "mov v2.16b, v19.16b\n"
    "mov v4.16b, v19.16b\n"
    "add x13, x13, #16\n"
    "mov v6.16b, v19.16b\n"
    "mov v1.16b, v19.16b\n"
    "mov v3.16b, v19.16b\n"
    "mov v0.16b, v19.16b\n"
    "fmla v8.4s, v25.4s, v17.4s\n"
    "fmla v8.4s, v27.4s, v14.4s\n"
    "bne 2b\n"
    // Vector epilogue: finish the last group of four channels without
    // loading anything for a following group.
    "3:\n"
    "fmla v5.4s, v27.4s, v17.4s\n"
    "ldr q27, [x17, x9]\n"
    "fmla v8.4s, v26.4s, v16.4s\n"
    "ldr q30, [%[inptr0], x15]\n"
    "fmla v7.4s, v26.4s, v17.4s\n"
    "ldr q31, [x26]\n"
    "fmla v5.4s, v20.4s, v14.4s\n"
    "ldr q24, [x21, %[input_col_stride1]]\n"
    "fmla v8.4s, v20.4s, v11.4s\n"
    "prfm pldl1keep, [x17, x22]\n"
    "fmla v2.4s, v20.4s, v17.4s\n"
    "ldr q29, [x14, x9]\n"
    "fmla v5.4s, v22.4s, v16.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v8.4s, v22.4s, v13.4s\n"
    "prfm pldl1keep, [x26, #64]\n"
    "fmla v7.4s, v22.4s, v14.4s\n"
    "prfm pldl1keep, [x21, x28]\n"
    "fmla v4.4s, v22.4s, v17.4s\n"
    "ldr q21, [x17, x15]\n"
    "fmla v8.4s, v28.4s, v15.4s\n"
    "prfm pldl1keep, [x14, x22]\n"
    "fmla v7.4s, v28.4s, v16.4s\n"
    "prfm pldl1keep, [x17, x16]\n"
    "fmla v6.4s, v28.4s, v17.4s\n"
    "ldr q19, [%[inptr0], x24]\n"
    "fmla v5.4s, v23.4s, v11.4s\n"
    "prfm pldl1keep, [%[inptr0], x23]\n"
    "fmla v2.4s, v23.4s, v14.4s\n"
    "ldr q28, [x26, %[input_col_stride1]]\n"
    "fmla v8.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x26, x28]\n"
    "fmla v5.4s, v18.4s, v13.4s\n"
    "prfm pldl1keep, [x21, x22]\n"
    "fmla v7.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x14, x16]\n"
    "fmla v2.4s, v18.4s, v16.4s\n"
    "prfm pldl1keep, [x17, x23]\n"
    "fmla v4.4s, v18.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x22]\n"
    "fmla v1.4s, v18.4s, v17.4s\n"
    "ldr q25, [x21, x9]\n"
    "fmla v8.4s, v27.4s, v12.4s\n"
    "prfm pldl1keep, [x21, x16]\n"
    "fmla v5.4s, v27.4s, v15.4s\n"
    "prfm pldl1keep, [x14, x23]\n"
    "fmla v7.4s, v27.4s, v13.4s\n"
    "prfm pldl1keep, [x26, x16]\n"
    "fmla v4.4s, v27.4s, v16.4s\n"
    "prfm pldl1keep, [x21, x23]\n"
    "fmla v6.4s, v27.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x23]\n"
    "fmla v3.4s, v27.4s, v17.4s\n"
    "ldr q27, [x14, x15]\n"
    "fmla v7.4s, v30.4s, v15.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v6.4s, v30.4s, v16.4s\n"
    "ldr q26, [x17, x24]\n"
    "fmla v2.4s, v31.4s, v11.4s\n"
    "ldr q20, [x26, x9]\n"
    "fmla v5.4s, v24.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v4.4s, v24.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v2.4s, v24.4s, v13.4s\n"
    "add x17, x17, #16\n"
    "fmla v1.4s, v24.4s, v14.4s\n"
    "ldr q18, [x21, x15]\n"
    "fmla v8.4s, v29.4s, v9.4s\n"
    "fmla v5.4s, v29.4s, v12.4s\n"
    "fmla v7.4s, v29.4s, v10.4s\n"
    "fmla v2.4s, v29.4s, v15.4s\n"
    "fmla v4.4s, v29.4s, v13.4s\n"
    "fmla v6.4s, v29.4s, v11.4s\n"
    "fmla v1.4s, v29.4s, v16.4s\n"
    "fmla v3.4s, v29.4s, v14.4s\n"
    "fmla v0.4s, v29.4s, v17.4s\n"
    "ldr q22, [x14, x24]\n"
    "fmla v7.4s, v21.4s, v12.4s\n"
    "ldr q23, [x26, x15]\n"
    "fmla v4.4s, v21.4s, v15.4s\n"
    "add x14, x14, #16\n"
    "fmla v6.4s, v21.4s, v13.4s\n"
    "fmla v3.4s, v21.4s, v16.4s\n"
    "fmla v2.4s, v28.4s, v10.4s\n"
    "ldr q24, [x21, x24]\n"
    "fmla v1.4s, v28.4s, v11.4s\n"
    "ldr q21, [x26, x24]\n"
    "fmla v6.4s, v19.4s, v15.4s\n"
    "add x21, x21, #16\n"
    "fmla v5.4s, v25.4s, v9.4s\n"
    "add x26, x26, #16\n"
    "fmla v2.4s, v25.4s, v12.4s\n"
    "fmla v4.4s, v25.4s, v10.4s\n"
    "fmla v1.4s, v25.4s, v13.4s\n"
    "fmla v3.4s, v25.4s, v11.4s\n"
    "fmla v0.4s, v25.4s, v14.4s\n"
    "fmla v7.4s, v27.4s, v9.4s\n"
    "fmla v4.4s, v27.4s, v12.4s\n"
    "fmla v6.4s, v27.4s, v10.4s\n"
    "fmla v1.4s, v27.4s, v15.4s\n"
    "fmla v3.4s, v27.4s, v13.4s\n"
    "fmla v0.4s, v27.4s, v16.4s\n"
    "fmla v2.4s, v20.4s, v9.4s\n"
    "fmla v6.4s, v26.4s, v12.4s\n"
    "fmla v4.4s, v18.4s, v9.4s\n"
    "fmla v3.4s, v26.4s, v15.4s\n"
    "fmla v1.4s, v20.4s, v10.4s\n"
    "fmla v0.4s, v20.4s, v11.4s\n"
    "movi v30.16b, #0\n"
    "fmla v6.4s, v22.4s, v9.4s\n"
    "fmov v29.4s, #6.0\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "fmla v3.4s, v18.4s, v10.4s\n"
    "fmla v0.4s, v18.4s, v13.4s\n"
    "fmax v8.4s, v8.4s, v30.4s\n"
    "fmax v7.4s, v7.4s, v30.4s\n"
    "fmax v6.4s, v6.4s, v30.4s\n"
    "fmla v3.4s, v22.4s, v12.4s\n"
    "fmla v1.4s, v23.4s, v9.4s\n"
    "fmla v0.4s, v22.4s, v15.4s\n"
    "fmin v8.4s, v8.4s, v29.4s\n"
    "fmin v7.4s, v7.4s, v29.4s\n"
    "fmin v6.4s, v6.4s, v29.4s\n"
    "str q8, [%[outptr0]]\n"
    "fmla v3.4s, v24.4s, v9.4s\n"
    "str q7, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v0.4s, v23.4s, v10.4s\n"
    "str q6, [%[outptr0], x27]\n"
    "fmax v5.4s, v5.4s, v30.4s\n"
    "fmax v4.4s, v4.4s, v30.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmla v0.4s, v24.4s, v12.4s\n"
    "fmin v5.4s, v5.4s, v29.4s\n"
    "fmin v4.4s, v4.4s, v29.4s\n"
    "fmax v3.4s, v3.4s, v30.4s\n"
    "str q5, [x25]\n"
    "fmax v2.4s, v2.4s, v30.4s\n"
    "str q4, [x25, %[output_col_stride1]]\n"
    "fmla v0.4s, v21.4s, v9.4s\n"
    "fmin v3.4s, v3.4s, v29.4s\n"
    "fmin v2.4s, v2.4s, v29.4s\n"
    "fmax v1.4s, v1.4s, v30.4s\n"
    "str q3, [x25, x27]\n"
    "str q2, [x13]\n"
    "fmin v1.4s, v1.4s, v29.4s\n"
    "fmax v0.4s, v0.4s, v30.4s\n"
    "add x25, x25, #16\n"
    "str q1, [x13, %[output_col_stride1]]\n"
    "fmin v0.4s, v0.4s, v29.4s\n"
    "str q0, [x13, x27]\n"
    "add x13, x13, #16\n"
    // Scalar tail: same schedule as above, one channel (one float) at a time.
    "4:\n"
    "cbz x19, 7f\n"
    "ldr s19, [%[wbptr]]\n"
    "mov v8.16b, v19.16b\n"
    "ldr s17, [%[wbptr], #4]\n"
    "mov v5.16b, v19.16b\n"
    "ldr s16, [%[wbptr], #8]\n"
    "mov v7.16b, v19.16b\n"
    "ldr s15, [%[wbptr], #12]\n"
    "mov v2.16b, v19.16b\n"
    "ldr s14, [%[wbptr], #16]\n"
    "mov v4.16b, v19.16b\n"
    "ldr s13, [%[wbptr], #20]\n"
    "mov v6.16b, v19.16b\n"
    "ldr s12, [%[wbptr], #24]\n"
    "mov v1.16b, v19.16b\n"
    "ldr s11, [%[wbptr], #28]\n"
    "mov v3.16b, v19.16b\n"
    "ldr s10, [%[wbptr], #32]\n"
    "mov v0.16b, v19.16b\n"
    "ldr s9, [%[wbptr], #36]\n"
    "ldr s25, [%[inptr0]]\n"
    "subs x19, x19, #1\n"
    "fmla v8.4s, v25.4s, v17.4s\n"
    "ldr s27, [x17]\n"
    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
    "ldr s20, [x14]\n"
    "ldr s22, [x17, %[input_col_stride1]]\n"
    "ldr s28, [%[inptr0], x9]\n"
    "fmla v8.4s, v27.4s, v14.4s\n"
    "ldr s23, [x21]\n"
    "ldr s18, [x14, %[input_col_stride1]]\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x17, #64]\n"
    "prfm pldl1keep, [%[inptr0], x28]\n"
    "prfm pldl1keep, [x14, #64]\n"
    "prfm pldl1keep, [x17, x28]\n"
    "prfm pldl1keep, [%[inptr0], x22]\n"
    "prfm pldl1keep, [x21, #64]\n"
    "prfm pldl1keep, [x14, x28]\n"
    "beq 6f\n"
    // Scalar steady state.
    "5:\n"
    "fmla v5.4s, v27.4s, v17.4s\n"
    "ldr s27, [x17, x9]\n"
    "fmla v8.4s, v26.4s, v16.4s\n"
    "ldr s30, [%[inptr0], x15]\n"
    "fmla v7.4s, v26.4s, v17.4s\n"
    "ldr s31, [x26]\n"
    "fmla v5.4s, v20.4s, v14.4s\n"
    "ldr s24, [x21, %[input_col_stride1]]\n"
    "fmla v8.4s, v20.4s, v11.4s\n"
    "prfm pldl1keep, [x17, x22]\n"
    "fmla v2.4s, v20.4s, v17.4s\n"
    "ldr s29, [x14, x9]\n"
    "fmla v5.4s, v22.4s, v16.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v8.4s, v22.4s, v13.4s\n"
    "prfm pldl1keep, [x26, #64]\n"
    "fmla v7.4s, v22.4s, v14.4s\n"
    "prfm pldl1keep, [x21, x28]\n"
    "fmla v4.4s, v22.4s, v17.4s\n"
    "ldr s21, [x17, x15]\n"
    "fmla v8.4s, v28.4s, v15.4s\n"
    "prfm pldl1keep, [x14, x22]\n"
    "fmla v7.4s, v28.4s, v16.4s\n"
    "prfm pldl1keep, [x17, x16]\n"
    "fmla v6.4s, v28.4s, v17.4s\n"
    "ldr s19, [%[inptr0], x24]\n"
    "fmla v5.4s, v23.4s, v11.4s\n"
    "prfm pldl1keep, [%[inptr0], x23]\n"
    "fmla v2.4s, v23.4s, v14.4s\n"
    "ldr s28, [x26, %[input_col_stride1]]\n"
    "fmla v8.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x26, x28]\n"
    "fmla v5.4s, v18.4s, v13.4s\n"
    "prfm pldl1keep, [x21, x22]\n"
    "fmla v7.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x14, x16]\n"
    "fmla v2.4s, v18.4s, v16.4s\n"
    "prfm pldl1keep, [x17, x23]\n"
    "fmla v4.4s, v18.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x22]\n"
    "fmla v1.4s, v18.4s, v17.4s\n"
    "ldr s25, [x21, x9]\n"
    "fmla v8.4s, v27.4s, v12.4s\n"
    "prfm pldl1keep, [x21, x16]\n"
    "fmla v5.4s, v27.4s, v15.4s\n"
    "prfm pldl1keep, [x14, x23]\n"
    "fmla v7.4s, v27.4s, v13.4s\n"
    "prfm pldl1keep, [x26, x16]\n"
    "fmla v4.4s, v27.4s, v16.4s\n"
    "prfm pldl1keep, [x21, x23]\n"
    "fmla v6.4s, v27.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x23]\n"
    "fmla v3.4s, v27.4s, v17.4s\n"
    "ldr s27, [x14, x15]\n"
    "fmla v7.4s, v30.4s, v15.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v6.4s, v30.4s, v16.4s\n"
    "ldr s26, [x17, x24]\n"
    "fmla v2.4s, v31.4s, v11.4s\n"
    "ldr s20, [x26, x9]\n"
    "fmla v5.4s, v24.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v4.4s, v24.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v2.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v1.4s, v24.4s, v14.4s\n"
    "ldr s18, [x21, x15]\n"
    "fmla v8.4s, v29.4s, v9.4s\n"
    "prfm pldl1keep, [%[inptr0], x28]\n"
    "fmla v5.4s, v29.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], x22]\n"
    "fmla v7.4s, v29.4s, v10.4s\n"
    "add x17, x17, #4\n"
    "fmla v2.4s, v29.4s, v15.4s\n"
    "prfm pldl1keep, [x17, #64]\n"
    "fmla v4.4s, v29.4s, v13.4s\n"
    "prfm pldl1keep, [x17, x28]\n"
    "fmla v6.4s, v29.4s, v11.4s\n"
    "subs x19, x19, #1\n"
    "fmla v1.4s, v29.4s, v16.4s\n"
    "fmla v3.4s, v29.4s, v14.4s\n"
    "fmla v0.4s, v29.4s, v17.4s\n"
    "ldr s22, [x14, x24]\n"
    "fmla v7.4s, v21.4s, v12.4s\n"
    "ldr s23, [x26, x15]\n"
    "fmla v4.4s, v21.4s, v15.4s\n"
    "add x14, x14, #4\n"
    "fmla v6.4s, v21.4s, v13.4s\n"
    "prfm pldl1keep, [x14, #64]\n"
    "fmla v3.4s, v21.4s, v16.4s\n"
    "ldr s24, [x21, x24]\n"
    "fmla v2.4s, v28.4s, v10.4s\n"
    "prfm pldl1keep, [x14, x28]\n"
    "fmla v6.4s, v19.4s, v15.4s\n"
    "ldr s21, [x26, x24]\n"
    "fmla v1.4s, v28.4s, v11.4s\n"
    "ldr s19, [%[wbptr]]\n"
    "fmla v5.4s, v25.4s, v9.4s\n"
    "add x21, x21, #4\n"
    "fmla v2.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x21, #64]\n"
    "fmla v4.4s, v25.4s, v10.4s\n"
    "add x26, x26, #4\n"
    "fmla v1.4s, v25.4s, v13.4s\n"
    "fmla v3.4s, v25.4s, v11.4s\n"
    "fmla v0.4s, v25.4s, v14.4s\n"
    "ldr s17, [%[wbptr], #4]\n"
    "fmla v7.4s, v27.4s, v9.4s\n"
    "ldr s25, [%[inptr0]]\n"
    "fmla v4.4s, v27.4s, v12.4s\n"
    "fmla v6.4s, v27.4s, v10.4s\n"
    "fmla v1.4s, v27.4s, v15.4s\n"
    "fmla v3.4s, v27.4s, v13.4s\n"
    "fmla v0.4s, v27.4s, v16.4s\n"
    "ldr s14, [%[wbptr], #16]\n"
    "fmla v6.4s, v26.4s, v12.4s\n"
    "ldr s27, [x17]\n"
    "fmla v3.4s, v26.4s, v15.4s\n"
    "ldr s26, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v2.4s, v20.4s, v9.4s\n"
    "fmla v1.4s, v20.4s, v10.4s\n"
    "fmla v0.4s, v20.4s, v11.4s\n"
    "ldr s16, [%[wbptr], #8]\n"
    "fmla v4.4s, v18.4s, v9.4s\n"
    "ldr s20, [x14]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "fmla v3.4s, v18.4s, v10.4s\n"
    "fmla v0.4s, v18.4s, v13.4s\n"
    "ldr s11, [%[wbptr], #28]\n"
    "fmla v6.4s, v22.4s, v9.4s\n"
    "movi v30.16b, #0\n"
    "fmla v3.4s, v22.4s, v12.4s\n"
    "fmla v1.4s, v23.4s, v9.4s\n"
    "fmla v0.4s, v22.4s, v15.4s\n"
    "ldr s13, [%[wbptr], #20]\n"
    "fmov v29.4s, #6.0\n"
    "fmax v8.4s, v8.4s, v30.4s\n"
    "fmla v3.4s, v24.4s, v9.4s\n"
    "fmax v7.4s, v7.4s, v30.4s\n"
    "fmla v0.4s, v23.4s, v10.4s\n"
    "ldr s15, [%[wbptr], #12]\n"
    "fmin v8.4s, v8.4s, v29.4s\n"
    "ldr s22, [x17, %[input_col_stride1]]\n"
    "fmin v7.4s, v7.4s, v29.4s\n"
    "fmax v6.4s, v6.4s, v30.4s\n"
    "str s8, [%[outptr0]]\n"
    "fmla v0.4s, v24.4s, v12.4s\n"
    "str s7, [%[outptr0], %[output_col_stride1]]\n"
    "fmin v6.4s, v6.4s, v29.4s\n"
    "fmax v5.4s, v5.4s, v30.4s\n"
    "ldr s10, [%[wbptr], #32]\n"
    "str s6, [%[outptr0], x27]\n"
    "fmla v0.4s, v21.4s, v9.4s\n"
    "fmin v5.4s, v5.4s, v29.4s\n"
    "ldr s12, [%[wbptr], #24]\n"
    "fmax v4.4s, v4.4s, v30.4s\n"
    "ldr s28, [%[inptr0], x9]\n"
    "str s5, [x25]\n"
    "fmax v3.4s, v3.4s, v30.4s\n"
    "fmin v4.4s, v4.4s, v29.4s\n"
    "ldr s9, [%[wbptr], #36]\n"
    "fmin v3.4s, v3.4s, v29.4s\n"
    "ldr s23, [x21]\n"
    "str s4, [x25, %[output_col_stride1]]\n"
    "fmax v2.4s, v2.4s, v30.4s\n"
    "str s3, [x25, x27]\n"
    "fmax v1.4s, v1.4s, v30.4s\n"
    "fmin v2.4s, v2.4s, v29.4s\n"
    "ldr s18, [x14, %[input_col_stride1]]\n"
    "fmin v1.4s, v1.4s, v29.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "str s2, [x13]\n"
    "fmax v0.4s, v0.4s, v30.4s\n"
    "str s1, [x13, %[output_col_stride1]]\n"
    "mov v8.16b, v19.16b\n"
    "fmin v0.4s, v0.4s, v29.4s\n"
    "add x25, x25, #4\n"
    "mov v5.16b, v19.16b\n"
    "mov v7.16b, v19.16b\n"
    "str s0, [x13, x27]\n"
    "mov v2.16b, v19.16b\n"
    "mov v4.16b, v19.16b\n"
    "add x13, x13, #4\n"
    "mov v6.16b, v19.16b\n"
    "mov v1.16b, v19.16b\n"
    "mov v3.16b, v19.16b\n"
    "mov v0.16b, v19.16b\n"
    "fmla v8.4s, v25.4s, v17.4s\n"
    "fmla v8.4s, v27.4s, v14.4s\n"
    "bne 5b\n"
    // Scalar epilogue: finish the last channel.
    "6:\n"
    "fmla v5.4s, v27.4s, v17.4s\n"
    "ldr s27, [x17, x9]\n"
    "fmla v8.4s, v26.4s, v16.4s\n"
    "ldr s30, [%[inptr0], x15]\n"
    "fmla v7.4s, v26.4s, v17.4s\n"
    "ldr s31, [x26]\n"
    "fmla v5.4s, v20.4s, v14.4s\n"
    "ldr s24, [x21, %[input_col_stride1]]\n"
    "fmla v8.4s, v20.4s, v11.4s\n"
    "prfm pldl1keep, [x17, x22]\n"
    "fmla v2.4s, v20.4s, v17.4s\n"
    "ldr s29, [x14, x9]\n"
    "fmla v5.4s, v22.4s, v16.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v8.4s, v22.4s, v13.4s\n"
    "prfm pldl1keep, [x26, #64]\n"
    "fmla v7.4s, v22.4s, v14.4s\n"
    "prfm pldl1keep, [x21, x28]\n"
    "fmla v4.4s, v22.4s, v17.4s\n"
    "ldr s21, [x17, x15]\n"
    "fmla v8.4s, v28.4s, v15.4s\n"
    "prfm pldl1keep, [x14, x22]\n"
    "fmla v7.4s, v28.4s, v16.4s\n"
    "prfm pldl1keep, [x17, x16]\n"
    "fmla v6.4s, v28.4s, v17.4s\n"
    "ldr s19, [%[inptr0], x24]\n"
    "fmla v5.4s, v23.4s, v11.4s\n"
    "prfm pldl1keep, [%[inptr0], x23]\n"
    "fmla v2.4s, v23.4s, v14.4s\n"
    "ldr s28, [x26, %[input_col_stride1]]\n"
    "fmla v8.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x26, x28]\n"
    "fmla v5.4s, v18.4s, v13.4s\n"
    "prfm pldl1keep, [x21, x22]\n"
    "fmla v7.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x14, x16]\n"
    "fmla v2.4s, v18.4s, v16.4s\n"
    "prfm pldl1keep, [x17, x23]\n"
    "fmla v4.4s, v18.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x22]\n"
    "fmla v1.4s, v18.4s, v17.4s\n"
    "ldr s25, [x21, x9]\n"
    "fmla v8.4s, v27.4s, v12.4s\n"
    "prfm pldl1keep, [x21, x16]\n"
    "fmla v5.4s, v27.4s, v15.4s\n"
    "prfm pldl1keep, [x14, x23]\n"
    "fmla v7.4s, v27.4s, v13.4s\n"
    "prfm pldl1keep, [x26, x16]\n"
    "fmla v4.4s, v27.4s, v16.4s\n"
    "prfm pldl1keep, [x21, x23]\n"
    "fmla v6.4s, v27.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x23]\n"
    "fmla v3.4s, v27.4s, v17.4s\n"
    "ldr s27, [x14, x15]\n"
    "fmla v7.4s, v30.4s, v15.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v6.4s, v30.4s, v16.4s\n"
    "ldr s26, [x17, x24]\n"
    "fmla v2.4s, v31.4s, v11.4s\n"
    "ldr s20, [x26, x9]\n"
    "fmla v5.4s, v24.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v4.4s, v24.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v2.4s, v24.4s, v13.4s\n"
    "add x17, x17, #4\n"
    "fmla v1.4s, v24.4s, v14.4s\n"
    "ldr s18, [x21, x15]\n"
    "fmla v8.4s, v29.4s, v9.4s\n"
    "fmla v5.4s, v29.4s, v12.4s\n"
    "fmla v7.4s, v29.4s, v10.4s\n"
    "fmla v2.4s, v29.4s, v15.4s\n"
    "fmla v4.4s, v29.4s, v13.4s\n"
    "fmla v6.4s, v29.4s, v11.4s\n"
    "fmla v1.4s, v29.4s, v16.4s\n"
    "fmla v3.4s, v29.4s, v14.4s\n"
    "fmla v0.4s, v29.4s, v17.4s\n"
    "ldr s22, [x14, x24]\n"
    "fmla v7.4s, v21.4s, v12.4s\n"
    "ldr s23, [x26, x15]\n"
    "fmla v4.4s, v21.4s, v15.4s\n"
    "add x14, x14, #4\n"
    "fmla v6.4s, v21.4s, v13.4s\n"
    "fmla v3.4s, v21.4s, v16.4s\n"
    "fmla v2.4s, v28.4s, v10.4s\n"
    "ldr s24, [x21, x24]\n"
    "fmla v1.4s, v28.4s, v11.4s\n"
    "ldr s21, [x26, x24]\n"
    "fmla v6.4s, v19.4s, v15.4s\n"
    "add x21, x21, #4\n"
    "fmla v5.4s, v25.4s, v9.4s\n"
    "add x26, x26, #4\n"
    "fmla v2.4s, v25.4s, v12.4s\n"
    "fmla v4.4s, v25.4s, v10.4s\n"
    "fmla v1.4s, v25.4s, v13.4s\n"
    "fmla v3.4s, v25.4s, v11.4s\n"
    "fmla v0.4s, v25.4s, v14.4s\n"
    "fmla v7.4s, v27.4s, v9.4s\n"
    "fmla v4.4s, v27.4s, v12.4s\n"
    "fmla v6.4s, v27.4s, v10.4s\n"
    "fmla v1.4s, v27.4s, v15.4s\n"
    "fmla v3.4s, v27.4s, v13.4s\n"
    "fmla v0.4s, v27.4s, v16.4s\n"
    "fmla v2.4s, v20.4s, v9.4s\n"
    "fmla v6.4s, v26.4s, v12.4s\n"
    "fmla v4.4s, v18.4s, v9.4s\n"
    "fmla v3.4s, v26.4s, v15.4s\n"
    "fmla v1.4s, v20.4s, v10.4s\n"
    "fmla v0.4s, v20.4s, v11.4s\n"
    "movi v30.16b, #0\n"
    "fmla v6.4s, v22.4s, v9.4s\n"
    "fmov v29.4s, #6.0\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "fmla v3.4s, v18.4s, v10.4s\n"
    "fmla v0.4s, v18.4s, v13.4s\n"
    "fmax v8.4s, v8.4s, v30.4s\n"
    "fmax v7.4s, v7.4s, v30.4s\n"
    "fmax v6.4s, v6.4s, v30.4s\n"
    "fmla v3.4s, v22.4s, v12.4s\n"
    "fmla v1.4s, v23.4s, v9.4s\n"
    "fmla v0.4s, v22.4s, v15.4s\n"
    "fmin v8.4s, v8.4s, v29.4s\n"
    "fmin v7.4s, v7.4s, v29.4s\n"
    "fmin v6.4s, v6.4s, v29.4s\n"
    "str s8, [%[outptr0]]\n"
    "fmla v3.4s, v24.4s, v9.4s\n"
    "str s7, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v0.4s, v23.4s, v10.4s\n"
    "str s6, [%[outptr0], x27]\n"
    "fmax v5.4s, v5.4s, v30.4s\n"
    "fmax v4.4s, v4.4s, v30.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmla v0.4s, v24.4s, v12.4s\n"
    "fmin v5.4s, v5.4s, v29.4s\n"
    "fmin v4.4s, v4.4s, v29.4s\n"
    "fmax v3.4s, v3.4s, v30.4s\n"
    "str s5, [x25]\n"
    "fmax v2.4s, v2.4s, v30.4s\n"
    "str s4, [x25, %[output_col_stride1]]\n"
    "fmla v0.4s, v21.4s, v9.4s\n"
    "fmin v3.4s, v3.4s, v29.4s\n"
    "fmin v2.4s, v2.4s, v29.4s\n"
    "fmax v1.4s, v1.4s, v30.4s\n"
    "str s3, [x25, x27]\n"
    "str s2, [x13]\n"
    "fmin v1.4s, v1.4s, v29.4s\n"
    "fmax v0.4s, v0.4s, v30.4s\n"
    "add x25, x25, #4\n"
    "str s1, [x13, %[output_col_stride1]]\n"
    "fmin v0.4s, v0.4s, v29.4s\n"
    "str s0, [x13, x27]\n"
    "add x13, x13, #4\n"
    "7:\n"
    // The three pointers are read-write operands because the asm advances them.
    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
    // Strides are converted from elements to bytes here.
    : [input_col_stride1] "r" (input_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x13", "x14", "x15", "x16", "x17", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
  );
}

#endif  // __aarch64__

// Explicit instantiation of the specialisation aliased as Conv at the top of
// this file, so that its members are emitted in this translation unit.
template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float, float>;

}  // namespace depthwise
